mirror of
https://github.com/postgres/postgres.git
synced 2025-07-02 09:02:37 +03:00
Modify COPY TO to emit carriage returns and newlines as backslash escapes
(backslash-r, backslash-n) for protection against newline-conversion munging. In future we will also tweak COPY FROM, but this part of the change should be backwards-compatible. Per pghackers discussion. Also, update COPY reference page to describe the backslash conversions more completely and accurately.
This commit is contained in:
@ -1,5 +1,5 @@
|
|||||||
<!--
|
<!--
|
||||||
$Header: /cvsroot/pgsql/doc/src/sgml/ref/copy.sgml,v 1.27 2002/01/20 22:19:56 petere Exp $
|
$Header: /cvsroot/pgsql/doc/src/sgml/ref/copy.sgml,v 1.28 2002/02/12 21:25:34 tgl Exp $
|
||||||
PostgreSQL documentation
|
PostgreSQL documentation
|
||||||
-->
|
-->
|
||||||
|
|
||||||
@ -74,7 +74,7 @@ COPY [ BINARY ] <replaceable class="parameter">table</replaceable> [ WITH OIDS ]
|
|||||||
<term><replaceable class="parameter">filename</replaceable></term>
|
<term><replaceable class="parameter">filename</replaceable></term>
|
||||||
<listitem>
|
<listitem>
|
||||||
<para>
|
<para>
|
||||||
The absolute Unix file name of the input or output file.
|
The absolute Unix path name of the input or output file.
|
||||||
</para>
|
</para>
|
||||||
</listitem>
|
</listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
@ -225,7 +225,7 @@ ERROR: <replaceable>reason</replaceable>
|
|||||||
By default, a text copy uses a tab ("\t") character as a delimiter
|
By default, a text copy uses a tab ("\t") character as a delimiter
|
||||||
between fields. The field delimiter may be changed to any other single
|
between fields. The field delimiter may be changed to any other single
|
||||||
character with the keyword phrase USING DELIMITERS. Characters
|
character with the keyword phrase USING DELIMITERS. Characters
|
||||||
in data fields which happen to match the delimiter character will
|
in data fields that happen to match the delimiter character will
|
||||||
be backslash quoted.
|
be backslash quoted.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
@ -265,8 +265,8 @@ ERROR: <replaceable>reason</replaceable>
|
|||||||
by the <application>PostgreSQL</application> user (the user ID the
|
by the <application>PostgreSQL</application> user (the user ID the
|
||||||
server runs as), not the client.
|
server runs as), not the client.
|
||||||
<command>COPY</command> naming a file is only allowed to database
|
<command>COPY</command> naming a file is only allowed to database
|
||||||
superusers, since it allows writing on any file that the backend has
|
superusers, since it allows reading or writing any file that the backend
|
||||||
privileges to write on.
|
has privileges to access.
|
||||||
|
|
||||||
<tip>
|
<tip>
|
||||||
<para>
|
<para>
|
||||||
@ -297,57 +297,109 @@ ERROR: <replaceable>reason</replaceable>
|
|||||||
<title>File Formats</title>
|
<title>File Formats</title>
|
||||||
<refsect2>
|
<refsect2>
|
||||||
<refsect2info>
|
<refsect2info>
|
||||||
<date>2001-01-02</date>
|
<date>2002-02-12</date>
|
||||||
</refsect2info>
|
</refsect2info>
|
||||||
<title>Text Format</title>
|
<title>Text Format</title>
|
||||||
<para>
|
<para>
|
||||||
When <command>COPY TO</command> is used without the BINARY option,
|
When <command>COPY</command> is used without the BINARY option,
|
||||||
the file generated will have each row (instance) on a single line, with each
|
the file read or written is a text file with one line per table row.
|
||||||
column (attribute) separated by the delimiter character. Embedded
|
Columns (attributes) in a row are separated by the delimiter character.
|
||||||
delimiter characters will be preceded by a backslash character
|
The attribute values themselves are strings generated by the
|
||||||
("\"). The attribute values themselves are strings generated by the
|
output function, or acceptable to the input function, of each
|
||||||
output function associated with each attribute type. The output
|
attribute's data type. The specified null-value string is used in
|
||||||
function for a type should not try to generate the backslash
|
place of attributes that are NULL.
|
||||||
character; this will be handled by <command>COPY</command> itself.
|
</para>
|
||||||
|
<para>
|
||||||
|
If WITH OIDS is specified, the OID is read or written as the first column,
|
||||||
|
preceding the user data columns. (An error is raised if WITH OIDS is
|
||||||
|
specified for a table that does not have OIDs.)
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
End of data can be represented by a single line containing just
|
||||||
|
backslash-period (<literal>\.</>). An end-of-data marker is
|
||||||
|
not necessary when reading from a Unix file, since the end of file
|
||||||
|
serves perfectly well; but an end marker must be provided when copying
|
||||||
|
data to or from a client application.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Backslash characters (<literal>\</>) may be used in the
|
||||||
|
<command>COPY</command> data to quote data characters that might otherwise
|
||||||
|
be taken as row or column delimiters. In particular, the following
|
||||||
|
characters <emphasis>must</> be preceded by a backslash if they appear
|
||||||
|
as part of an attribute value: backslash itself, newline, and the current
|
||||||
|
delimiter character.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
The following special backslash sequences are recognized by
|
||||||
|
<command>COPY FROM</command>:
|
||||||
|
|
||||||
|
<informaltable>
|
||||||
|
<tgroup cols="2">
|
||||||
|
<thead>
|
||||||
|
<row>
|
||||||
|
<entry>Sequence</entry>
|
||||||
|
<entry>Represents</entry>
|
||||||
|
</row>
|
||||||
|
</thead>
|
||||||
|
|
||||||
|
<tbody>
|
||||||
|
<row>
|
||||||
|
<entry><literal>\b</></entry>
|
||||||
|
<entry>Backspace (ASCII 8)</entry>
|
||||||
|
</row>
|
||||||
|
<row>
|
||||||
|
<entry><literal>\f</></entry>
|
||||||
|
<entry>Form feed (ASCII 12)</entry>
|
||||||
|
</row>
|
||||||
|
<row>
|
||||||
|
<entry><literal>\n</></entry>
|
||||||
|
<entry>Newline (ASCII 10)</entry>
|
||||||
|
</row>
|
||||||
|
<row>
|
||||||
|
<entry><literal>\r</></entry>
|
||||||
|
<entry>Carriage return (ASCII 13)</entry>
|
||||||
|
</row>
|
||||||
|
<row>
|
||||||
|
<entry><literal>\t</></entry>
|
||||||
|
<entry>Tab (ASCII 9)</entry>
|
||||||
|
</row>
|
||||||
|
<row>
|
||||||
|
<entry><literal>\v</></entry>
|
||||||
|
<entry>Vertical tab (ASCII 11)</entry>
|
||||||
|
</row>
|
||||||
|
<row>
|
||||||
|
<entry><literal>\</><replaceable>digits</></entry>
|
||||||
|
<entry>Backslash followed by one to three octal digits specifies
|
||||||
|
the character with that numeric code</entry>
|
||||||
|
</row>
|
||||||
|
</tbody>
|
||||||
|
</tgroup>
|
||||||
|
</informaltable>
|
||||||
|
|
||||||
|
Presently, <command>COPY TO</command> will never emit an octal-digits
|
||||||
|
backslash sequence, but it does use the other sequences listed above
|
||||||
|
for those control characters.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Never put a backslash before a data character <literal>N</> or period
|
||||||
|
(<literal>.</>). Such pairs will be mistaken for the default null string
|
||||||
|
or the end-of-data marker, respectively. Any other backslashed character
|
||||||
|
that is not mentioned in the above table will be taken to represent itself.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
It is strongly recommended that applications generating COPY data convert
|
||||||
|
data newlines and carriage returns to the <literal>\n</> and
|
||||||
|
<literal>\r</> sequences respectively. At present
|
||||||
|
(<productname>PostgreSQL</productname> 7.2 and older versions) it is
|
||||||
|
possible to represent a data carriage return without any special quoting,
|
||||||
|
and to represent a data newline by a backslash and newline. However,
|
||||||
|
these representations will not be accepted by default in future releases.
|
||||||
</para>
|
</para>
|
||||||
<para>
|
<para>
|
||||||
The actual format for each instance is
|
|
||||||
<programlisting>
|
|
||||||
&lt;attr1&gt;&lt;<replaceable class=parameter>separator</replaceable>&gt;&lt;attr2&gt;&lt;<replaceable class=parameter>separator</replaceable>&gt;...&lt;<replaceable class=parameter>separator</replaceable>&gt;&lt;attr<replaceable class="parameter">n</replaceable>&gt;&lt;newline&gt;
|
|
||||||
</programlisting>
|
|
||||||
Note that the end of each row is marked by a Unix-style newline
|
Note that the end of each row is marked by a Unix-style newline
|
||||||
("\n"). <command>COPY FROM</command> will not behave as desired
|
("\n"). Presently, <command>COPY FROM</command> will not behave as
|
||||||
if given a file containing DOS- or Mac-style newlines.
|
desired if given a file containing DOS- or Mac-style newlines.
|
||||||
</para>
|
This is expected to change in future releases.
|
||||||
<para>
|
|
||||||
The OID is emitted as the first column if WITH OIDS is specified.
|
|
||||||
(An error is raised if WITH OIDS is specified for a table that does not
|
|
||||||
have OIDs.)
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
If <command>COPY TO</command> is sending its output to standard
|
|
||||||
output instead of a file, after the last row it will send a backslash ("\")
|
|
||||||
and a period (".") followed by a newline.
|
|
||||||
Similarly, if <command>COPY FROM</command> is reading
|
|
||||||
from standard input, it will expect a backslash ("\") and a period
|
|
||||||
(".") followed by a newline, as the first three characters on a
|
|
||||||
line to denote end-of-file. However, <command>COPY FROM</command>
|
|
||||||
will terminate correctly (followed by the backend itself) if the
|
|
||||||
input connection is closed before this special end-of-file pattern is
|
|
||||||
found.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The backslash character has other special meanings. A literal backslash
|
|
||||||
character is represented as two
|
|
||||||
consecutive backslashes ("\\"). A literal tab character is represented
|
|
||||||
as a backslash and a tab. (If you are using something other than tab
|
|
||||||
as the column delimiter, backslash that delimiter character to include
|
|
||||||
it in data.) A literal newline character is
|
|
||||||
represented as a backslash and a newline. When loading text data
|
|
||||||
not generated by <application>PostgreSQL</application>,
|
|
||||||
you will need to convert backslash
|
|
||||||
characters ("\") to double-backslashes ("\\") to ensure that they
|
|
||||||
are loaded properly.
|
|
||||||
</para>
|
</para>
|
||||||
</refsect2>
|
</refsect2>
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $Header: /cvsroot/pgsql/src/backend/commands/copy.c,v 1.144 2001/12/04 21:19:57 tgl Exp $
|
* $Header: /cvsroot/pgsql/src/backend/commands/copy.c,v 1.145 2002/02/12 21:25:41 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -41,7 +41,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
|
#define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
|
||||||
#define VALUE(c) ((c) - '0')
|
#define OCTVALUE(c) ((c) - '0')
|
||||||
|
|
||||||
|
|
||||||
/* non-export function prototypes */
|
/* non-export function prototypes */
|
||||||
@ -83,13 +83,13 @@ static int server_encoding;
|
|||||||
* Internal communications functions
|
* Internal communications functions
|
||||||
*/
|
*/
|
||||||
static void CopySendData(void *databuf, int datasize, FILE *fp);
|
static void CopySendData(void *databuf, int datasize, FILE *fp);
|
||||||
static void CopySendString(char *str, FILE *fp);
|
static void CopySendString(const char *str, FILE *fp);
|
||||||
static void CopySendChar(char c, FILE *fp);
|
static void CopySendChar(char c, FILE *fp);
|
||||||
static void CopyGetData(void *databuf, int datasize, FILE *fp);
|
static void CopyGetData(void *databuf, int datasize, FILE *fp);
|
||||||
static int CopyGetChar(FILE *fp);
|
static int CopyGetChar(FILE *fp);
|
||||||
static int CopyGetEof(FILE *fp);
|
static int CopyGetEof(FILE *fp);
|
||||||
static int CopyPeekChar(FILE *fp);
|
static int CopyPeekChar(FILE *fp);
|
||||||
static void CopyDonePeek(FILE *fp, int c, int pickup);
|
static void CopyDonePeek(FILE *fp, int c, bool pickup);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* CopySendData sends output data either to the file
|
* CopySendData sends output data either to the file
|
||||||
@ -118,9 +118,9 @@ CopySendData(void *databuf, int datasize, FILE *fp)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
CopySendString(char *str, FILE *fp)
|
CopySendString(const char *str, FILE *fp)
|
||||||
{
|
{
|
||||||
CopySendData(str, strlen(str), fp);
|
CopySendData((void *) str, strlen(str), fp);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@ -178,10 +178,12 @@ CopyGetEof(FILE *fp)
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* CopyPeekChar reads a byte in "peekable" mode.
|
* CopyPeekChar reads a byte in "peekable" mode.
|
||||||
|
*
|
||||||
* after each call to CopyPeekChar, a call to CopyDonePeek _must_
|
* after each call to CopyPeekChar, a call to CopyDonePeek _must_
|
||||||
* follow, unless EOF was returned.
|
* follow, unless EOF was returned.
|
||||||
* CopyDonePeek will either take the peeked char off the steam
|
*
|
||||||
* (if pickup is != 0) or leave it on the stream (if pickup == 0)
|
* CopyDonePeek will either take the peeked char off the stream
|
||||||
|
* (if pickup is true) or leave it on the stream (if pickup is false).
|
||||||
*/
|
*/
|
||||||
static int
|
static int
|
||||||
CopyPeekChar(FILE *fp)
|
CopyPeekChar(FILE *fp)
|
||||||
@ -199,15 +201,13 @@ CopyPeekChar(FILE *fp)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
CopyDonePeek(FILE *fp, int c, int pickup)
|
CopyDonePeek(FILE *fp, int c, bool pickup)
|
||||||
{
|
{
|
||||||
if (!fp)
|
if (!fp)
|
||||||
{
|
{
|
||||||
if (pickup)
|
if (pickup)
|
||||||
{
|
{
|
||||||
/*
|
/* We want to pick it up */
|
||||||
* We want to pick it up
|
|
||||||
*/
|
|
||||||
(void) pq_getbyte();
|
(void) pq_getbyte();
|
||||||
}
|
}
|
||||||
/* If we didn't want to pick it up, just leave it where it sits */
|
/* If we didn't want to pick it up, just leave it where it sits */
|
||||||
@ -219,7 +219,7 @@ CopyDonePeek(FILE *fp, int c, int pickup)
|
|||||||
/* We don't want to pick it up - so put it back in there */
|
/* We don't want to pick it up - so put it back in there */
|
||||||
ungetc(c, fp);
|
ungetc(c, fp);
|
||||||
}
|
}
|
||||||
/* If we wanted to pick it up, it's already there */
|
/* If we wanted to pick it up, it's already done */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1078,31 +1078,30 @@ CopyReadAttribute(FILE *fp, bool *isnull, char *delim, int *newline, char *null_
|
|||||||
{
|
{
|
||||||
int val;
|
int val;
|
||||||
|
|
||||||
val = VALUE(c);
|
val = OCTVALUE(c);
|
||||||
c = CopyPeekChar(fp);
|
c = CopyPeekChar(fp);
|
||||||
if (ISOCTAL(c))
|
if (ISOCTAL(c))
|
||||||
{
|
{
|
||||||
val = (val << 3) + VALUE(c);
|
val = (val << 3) + OCTVALUE(c);
|
||||||
CopyDonePeek(fp, c, 1); /* Pick up the
|
CopyDonePeek(fp, c, true /*pick up*/);
|
||||||
* character! */
|
|
||||||
c = CopyPeekChar(fp);
|
c = CopyPeekChar(fp);
|
||||||
if (ISOCTAL(c))
|
if (ISOCTAL(c))
|
||||||
{
|
{
|
||||||
CopyDonePeek(fp, c, 1); /* pick up! */
|
val = (val << 3) + OCTVALUE(c);
|
||||||
val = (val << 3) + VALUE(c);
|
CopyDonePeek(fp, c, true /*pick up*/);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (c == EOF)
|
if (c == EOF)
|
||||||
goto endOfFile;
|
goto endOfFile;
|
||||||
CopyDonePeek(fp, c, 0); /* Return to stream! */
|
CopyDonePeek(fp, c, false /*put back*/);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (c == EOF)
|
if (c == EOF)
|
||||||
goto endOfFile;
|
goto endOfFile;
|
||||||
CopyDonePeek(fp, c, 0); /* Return to stream! */
|
CopyDonePeek(fp, c, false /*put back*/);
|
||||||
}
|
}
|
||||||
c = val & 0377;
|
c = val & 0377;
|
||||||
}
|
}
|
||||||
@ -1144,6 +1143,7 @@ CopyReadAttribute(FILE *fp, bool *isnull, char *delim, int *newline, char *null_
|
|||||||
}
|
}
|
||||||
appendStringInfoCharMacro(&attribute_buf, c);
|
appendStringInfoCharMacro(&attribute_buf, c);
|
||||||
#ifdef MULTIBYTE
|
#ifdef MULTIBYTE
|
||||||
|
/* XXX shouldn't this be done even when encoding is the same? */
|
||||||
if (client_encoding != server_encoding)
|
if (client_encoding != server_encoding)
|
||||||
{
|
{
|
||||||
/* get additional bytes of the char, if any */
|
/* get additional bytes of the char, if any */
|
||||||
@ -1190,15 +1190,18 @@ CopyAttributeOut(FILE *fp, char *server_string, char *delim)
|
|||||||
{
|
{
|
||||||
char *string;
|
char *string;
|
||||||
char c;
|
char c;
|
||||||
|
char delimc = delim[0];
|
||||||
|
|
||||||
#ifdef MULTIBYTE
|
#ifdef MULTIBYTE
|
||||||
|
bool same_encoding;
|
||||||
char *string_start;
|
char *string_start;
|
||||||
int mblen;
|
int mblen;
|
||||||
int i;
|
int i;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef MULTIBYTE
|
#ifdef MULTIBYTE
|
||||||
if (client_encoding != server_encoding)
|
same_encoding = (server_encoding == client_encoding);
|
||||||
|
if (!same_encoding)
|
||||||
{
|
{
|
||||||
string = (char *) pg_server_to_client((unsigned char *) server_string,
|
string = (char *) pg_server_to_client((unsigned char *) server_string,
|
||||||
strlen(server_string));
|
strlen(server_string));
|
||||||
@ -1207,31 +1210,64 @@ CopyAttributeOut(FILE *fp, char *server_string, char *delim)
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
string = server_string;
|
string = server_string;
|
||||||
string_start = NULL; /* unused, but keep compiler quiet */
|
string_start = NULL;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
string = server_string;
|
string = server_string;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef MULTIBYTE
|
#ifdef MULTIBYTE
|
||||||
for (; (mblen = (server_encoding == client_encoding ? 1 : pg_encoding_mblen(client_encoding, string))) &&
|
for (; (c = *string) != '\0'; string += mblen)
|
||||||
((c = *string) != '\0'); string += mblen)
|
|
||||||
#else
|
#else
|
||||||
for (; (c = *string) != '\0'; string++)
|
for (; (c = *string) != '\0'; string++)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
if (c == delim[0] || c == '\n' || c == '\\')
|
|
||||||
CopySendChar('\\', fp);
|
|
||||||
#ifdef MULTIBYTE
|
#ifdef MULTIBYTE
|
||||||
for (i = 0; i < mblen; i++)
|
mblen = 1;
|
||||||
CopySendChar(*(string + i), fp);
|
|
||||||
#else
|
|
||||||
CopySendChar(c, fp);
|
|
||||||
#endif
|
#endif
|
||||||
|
switch (c)
|
||||||
|
{
|
||||||
|
case '\b':
|
||||||
|
CopySendString("\\b", fp);
|
||||||
|
break;
|
||||||
|
case '\f':
|
||||||
|
CopySendString("\\f", fp);
|
||||||
|
break;
|
||||||
|
case '\n':
|
||||||
|
CopySendString("\\n", fp);
|
||||||
|
break;
|
||||||
|
case '\r':
|
||||||
|
CopySendString("\\r", fp);
|
||||||
|
break;
|
||||||
|
case '\t':
|
||||||
|
CopySendString("\\t", fp);
|
||||||
|
break;
|
||||||
|
case '\v':
|
||||||
|
CopySendString("\\v", fp);
|
||||||
|
break;
|
||||||
|
case '\\':
|
||||||
|
CopySendString("\\\\", fp);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
if (c == delimc)
|
||||||
|
CopySendChar('\\', fp);
|
||||||
|
CopySendChar(c, fp);
|
||||||
|
#ifdef MULTIBYTE
|
||||||
|
/* XXX shouldn't this be done even when encoding is same? */
|
||||||
|
if (!same_encoding)
|
||||||
|
{
|
||||||
|
/* send additional bytes of the char, if any */
|
||||||
|
mblen = pg_encoding_mblen(client_encoding, string);
|
||||||
|
for (i = 1; i < mblen; i++)
|
||||||
|
CopySendChar(string[i], fp);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef MULTIBYTE
|
#ifdef MULTIBYTE
|
||||||
if (client_encoding != server_encoding)
|
if (string_start)
|
||||||
pfree(string_start); /* pfree pg_server_to_client result */
|
pfree(string_start); /* pfree pg_server_to_client result */
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user