1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-30 11:03:19 +03:00

Add header matching mode to COPY FROM

COPY FROM supports the HEADER option to silently discard the header
line from a CSV or text file.  It is possible to load by mistake a
file that matches the expected format, for example, if two text
columns have been swapped, resulting in garbage in the database.

This adds a new option value HEADER MATCH that checks the column names
in the header line against the actual column names and errors out if
they do not match.

Author: Rémi Lapeyre <remi.lapeyre@lenstra.fr>
Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Eisentraut <peter.eisentraut@enterprisedb.com>
Discussion: https://www.postgresql.org/message-id/flat/CAF1-J-0PtCWMeLtswwGV2M70U26n4g33gpe1rcKQqe6wVQDrFA@mail.gmail.com
This commit is contained in:
Peter Eisentraut
2022-03-30 08:56:58 +02:00
parent edcedcc2c7
commit 072132f04e
8 changed files with 210 additions and 8 deletions

View File

@ -313,6 +313,64 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt,
table_close(rel, NoLock);
}
/*
* Extract a CopyHeaderChoice value from a DefElem. This is like
* defGetBoolean() but also accepts the special value "match".
*/
static CopyHeaderChoice
defGetCopyHeaderChoice(DefElem *def)
{
/*
* If no parameter given, assume "true" is meant.
*/
if (def->arg == NULL)
return COPY_HEADER_TRUE;
/*
* Allow 0, 1, "true", "false", "on", "off", or "match".
*/
switch (nodeTag(def->arg))
{
case T_Integer:
switch (intVal(def->arg))
{
case 0:
return COPY_HEADER_FALSE;
case 1:
return COPY_HEADER_TRUE;
default:
/* otherwise, error out below */
break;
}
break;
default:
{
char *sval = defGetString(def);
/*
* The set of strings accepted here should match up with the
* grammar's opt_boolean_or_string production.
*/
if (pg_strcasecmp(sval, "true") == 0)
return COPY_HEADER_TRUE;
if (pg_strcasecmp(sval, "false") == 0)
return COPY_HEADER_FALSE;
if (pg_strcasecmp(sval, "on") == 0)
return COPY_HEADER_TRUE;
if (pg_strcasecmp(sval, "off") == 0)
return COPY_HEADER_FALSE;
if (pg_strcasecmp(sval, "match") == 0)
return COPY_HEADER_MATCH;
}
break;
}
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("%s requires a Boolean value or \"match\"",
def->defname)));
return COPY_HEADER_FALSE; /* keep compiler quiet */
}
/*
* Process the statement option list for COPY.
*
@ -394,7 +452,7 @@ ProcessCopyOptions(ParseState *pstate,
if (header_specified)
errorConflictingDefElem(defel, pstate);
header_specified = true;
opts_out->header_line = defGetBoolean(defel);
opts_out->header_line = defGetCopyHeaderChoice(defel);
}
else if (strcmp(defel->defname, "quote") == 0)
{

View File

@ -72,6 +72,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "port/pg_bswap.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/rel.h"
@ -758,12 +759,58 @@ NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
/* only available for text or csv input */
Assert(!cstate->opts.binary);
/* on input just throw the header line away */
/* on input check that the header line is correct if needed */
if (cstate->cur_lineno == 0 && cstate->opts.header_line)
{
ListCell *cur;
TupleDesc tupDesc;
tupDesc = RelationGetDescr(cstate->rel);
cstate->cur_lineno++;
if (CopyReadLine(cstate))
return false; /* done */
done = CopyReadLine(cstate);
if (cstate->opts.header_line == COPY_HEADER_MATCH)
{
int fldnum;
if (cstate->opts.csv_mode)
fldct = CopyReadAttributesCSV(cstate);
else
fldct = CopyReadAttributesText(cstate);
if (fldct != list_length(cstate->attnumlist))
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("wrong number of fields in header line: field count is %d, expected %d",
fldct, list_length(cstate->attnumlist))));
fldnum = 0;
foreach(cur, cstate->attnumlist)
{
int attnum = lfirst_int(cur);
char *colName = cstate->raw_fields[attnum - 1];
Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
fldnum++;
if (colName == NULL)
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
fldnum, cstate->opts.null_print, NameStr(attr->attname))));
if (namestrcmp(&attr->attname, colName) != 0) {
ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
fldnum, colName, NameStr(attr->attname))));
}
}
}
if (done)
return false;
}
cstate->cur_lineno++;

View File

@ -19,6 +19,17 @@
#include "parser/parse_node.h"
#include "tcop/dest.h"
/*
* Represents whether a header line should be present, and whether it must
* match the actual names (which implies "true").
*/
typedef enum CopyHeaderChoice
{
COPY_HEADER_FALSE = 0,
COPY_HEADER_TRUE,
COPY_HEADER_MATCH,
} CopyHeaderChoice;
/*
* A struct to hold COPY options, in a parsed form. All of these are related
* to formatting, except for 'freeze', which doesn't really belong here, but
@ -32,7 +43,7 @@ typedef struct CopyFormatOptions
bool binary; /* binary format? */
bool freeze; /* freeze rows on loading? */
bool csv_mode; /* Comma Separated Value format? */
bool header_line; /* header line? */
CopyHeaderChoice header_line; /* header line? */
char *null_print; /* NULL marker string (server encoding!) */
int null_print_len; /* length of same */
char *null_print_client; /* same converted to file encoding */

View File

@ -176,3 +176,26 @@ INFO: progress: {"type": "FILE", "command": "COPY FROM", "relname": "tab_progre
drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
drop function notice_after_tab_progress_reporting();
drop table tab_progress_reporting;
-- Test header matching feature
create table header_copytest (
a int,
b int,
c text
);
copy header_copytest from stdin with (header wrong_choice);
ERROR: header requires a Boolean value or "match"
copy header_copytest from stdin with (header match);
copy header_copytest from stdin with (header match);
ERROR: column name mismatch in header line field 3: got null value ("\N"), expected "c"
CONTEXT: COPY header_copytest, line 1: "a b \N"
copy header_copytest from stdin with (header match);
ERROR: wrong number of fields in header line: field count is 2, expected 3
CONTEXT: COPY header_copytest, line 1: "a b"
copy header_copytest from stdin with (header match);
ERROR: wrong number of fields in header line: field count is 4, expected 3
CONTEXT: COPY header_copytest, line 1: "a b c d"
copy header_copytest from stdin with (header match);
ERROR: column name mismatch in header line field 3: got "d", expected "c"
CONTEXT: COPY header_copytest, line 1: "a b d"
copy header_copytest from stdin with (header match, format csv);
drop table header_copytest;

View File

@ -197,3 +197,36 @@ copy tab_progress_reporting from :'filename'
drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
drop function notice_after_tab_progress_reporting();
drop table tab_progress_reporting;
-- Test header matching feature
create table header_copytest (
a int,
b int,
c text
);
copy header_copytest from stdin with (header wrong_choice);
copy header_copytest from stdin with (header match);
a b c
1 2 foo
\.
copy header_copytest from stdin with (header match);
a b \N
1 2 foo
\.
copy header_copytest from stdin with (header match);
a b
1 2
\.
copy header_copytest from stdin with (header match);
a b c d
1 2 foo bar
\.
copy header_copytest from stdin with (header match);
a b d
1 2 foo
\.
copy header_copytest from stdin with (header match, format csv);
a,b,c
1,2,foo
\.
drop table header_copytest;