diff options
-rw-r--r-- | contrib/file_fdw/file_fdw.c | 1 | ||||
-rw-r--r-- | doc/src/sgml/file-fdw.sgml | 11 | ||||
-rw-r--r-- | doc/src/sgml/ref/copy.sgml | 20 | ||||
-rw-r--r-- | src/backend/commands/copy.c | 60 | ||||
-rw-r--r-- | src/backend/parser/gram.y | 4 | ||||
-rw-r--r-- | src/backend/utils/mb/mbutils.c | 46 | ||||
-rw-r--r-- | src/include/mb/pg_wchar.h | 2 | ||||
-rw-r--r-- | src/test/regress/expected/copy2.out | 6 | ||||
-rw-r--r-- | src/test/regress/sql/copy2.sql | 6 |
9 files changed, 119 insertions, 37 deletions
diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c index 265afb5d9bc..6a84a00e8d3 100644 --- a/contrib/file_fdw/file_fdw.c +++ b/contrib/file_fdw/file_fdw.c @@ -55,6 +55,7 @@ static struct FileFdwOption valid_options[] = { { "quote", ForeignTableRelationId }, { "escape", ForeignTableRelationId }, { "null", ForeignTableRelationId }, + { "encoding", ForeignTableRelationId }, /* * force_quote is not supported by file_fdw because it's for COPY TO. diff --git a/doc/src/sgml/file-fdw.sgml b/doc/src/sgml/file-fdw.sgml index e2921667184..003c415b43a 100644 --- a/doc/src/sgml/file-fdw.sgml +++ b/doc/src/sgml/file-fdw.sgml @@ -97,6 +97,17 @@ </listitem> </varlistentry> + <varlistentry> + <term><literal>encoding</literal></term> + + <listitem> + <para> + Specifies the file's encoding. + This is the same as <command>COPY</>'s <literal>ENCODING</literal> option. + </para> + </listitem> + </varlistentry> + </variablelist> <para> diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml index 38424ad04b9..6429a4ef0d7 100644 --- a/doc/src/sgml/ref/copy.sgml +++ b/doc/src/sgml/ref/copy.sgml @@ -40,7 +40,8 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable QUOTE '<replaceable class="parameter">quote_character</replaceable>' ESCAPE '<replaceable class="parameter">escape_character</replaceable>' FORCE_QUOTE { ( <replaceable class="parameter">column</replaceable> [, ...] ) | * } - FORCE_NOT_NULL ( <replaceable class="parameter">column</replaceable> [, ...] ) + FORCE_NOT_NULL ( <replaceable class="parameter">column</replaceable> [, ...] 
) + ENCODING '<replaceable class="parameter">encoding_name</replaceable>' </synopsis> </refsynopsisdiv> @@ -282,6 +283,18 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable </listitem> </varlistentry> + <varlistentry> + <term><literal>ENCODING</></term> + <listitem> + <para> + Specifies that the file is encoded in the <replaceable + class="parameter">encoding_name</replaceable>. If this option is + omitted, the current client encoding is used. See the Notes below + for more details. + </para> + </listitem> + </varlistentry> + </variablelist> </refsect1> @@ -377,8 +390,9 @@ COPY <replaceable class="parameter">count</replaceable> </para> <para> - Input data is interpreted according to the current client encoding, - and output data is encoded in the current client encoding, even + Input data is interpreted according to the <literal>ENCODING</literal> + option or the current client encoding, and output data is encoded + in <literal>ENCODING</literal> or the current client encoding, even if the data does not pass through the client but is read from or written to a file directly by the server. </para> diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 294450ef660..cac11a6c641 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -95,8 +95,8 @@ typedef struct CopyStateData * dest == COPY_NEW_FE in COPY FROM */ bool fe_eof; /* true if detected end of copy data */ EolType eol_type; /* EOL type of input */ - int client_encoding; /* remote side's character encoding */ - bool need_transcoding; /* client encoding diff from server? */ + int file_encoding; /* file or remote side's character encoding */ + bool need_transcoding; /* file encoding diff from server? */ bool encoding_embeds_ascii; /* ASCII can be non-first byte? */ /* parameters from the COPY command */ @@ -110,7 +110,7 @@ typedef struct CopyStateData bool header_line; /* CSV header line? 
*/ char *null_print; /* NULL marker string (server encoding!) */ int null_print_len; /* length of same */ - char *null_print_client; /* same converted to client encoding */ + char *null_print_client; /* same converted to file encoding */ char *delim; /* column delimiter (must be 1 byte) */ char *quote; /* CSV quote char (must be 1 byte) */ char *escape; /* CSV escape char (must be 1 byte) */ @@ -845,6 +845,8 @@ ProcessCopyOptions(CopyState cstate, if (cstate == NULL) cstate = (CopyStateData *) palloc0(sizeof(CopyStateData)); + cstate->file_encoding = -1; + /* Extract options from the statement node tree */ foreach(option, options) { @@ -948,6 +950,19 @@ ProcessCopyOptions(CopyState cstate, errmsg("argument to option \"%s\" must be a list of column names", defel->defname))); } + else if (strcmp(defel->defname, "encoding") == 0) + { + if (cstate->file_encoding >= 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + cstate->file_encoding = pg_char_to_encoding(defGetString(defel)); + if (cstate->file_encoding < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("argument to option \"%s\" must be a valid encoding name", + defel->defname))); + } else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -1278,17 +1293,20 @@ BeginCopy(bool is_from, } } + /* Use client encoding when ENCODING option is not specified. */ + if (cstate->file_encoding < 0) + cstate->file_encoding = pg_get_client_encoding(); + /* - * Set up encoding conversion info. Even if the client and server - * encodings are the same, we must apply pg_client_to_server() to validate + * Set up encoding conversion info. Even if the file and server + * encodings are the same, we must apply pg_any_to_server() to validate * data in multibyte encodings. 
*/ - cstate->client_encoding = pg_get_client_encoding(); cstate->need_transcoding = - (cstate->client_encoding != GetDatabaseEncoding() || + (cstate->file_encoding != GetDatabaseEncoding() || pg_database_encoding_max_length() > 1); /* See Multibyte encoding comment above */ - cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->client_encoding); + cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->file_encoding); cstate->copy_dest = COPY_FILE; /* default */ @@ -1526,12 +1544,13 @@ CopyTo(CopyState cstate) else { /* - * For non-binary copy, we need to convert null_print to client + * For non-binary copy, we need to convert null_print to file * encoding, because it will be sent directly with CopySendString. */ if (cstate->need_transcoding) - cstate->null_print_client = pg_server_to_client(cstate->null_print, - cstate->null_print_len); + cstate->null_print_client = pg_server_to_any(cstate->null_print, + cstate->null_print_len, + cstate->file_encoding); /* if a header has been requested send the line */ if (cstate->header_line) @@ -2608,8 +2627,9 @@ CopyReadLine(CopyState cstate) { char *cvt; - cvt = pg_client_to_server(cstate->line_buf.data, - cstate->line_buf.len); + cvt = pg_any_to_server(cstate->line_buf.data, + cstate->line_buf.len, + cstate->file_encoding); if (cvt != cstate->line_buf.data) { /* transfer converted data back to line_buf */ @@ -2854,7 +2874,7 @@ CopyReadLineText(CopyState cstate) /* ----- * get next character * Note: we do not change c so if it isn't \., we can fall - * through and continue processing for client encoding. + * through and continue processing for file encoding. 
* ----- */ c2 = copy_raw_buf[raw_buf_ptr]; @@ -2968,7 +2988,7 @@ not_end_of_copy: mblen_str[0] = c; /* All our encodings only read the first byte to get the length */ - mblen = pg_encoding_mblen(cstate->client_encoding, mblen_str); + mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str); IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1); IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1); raw_buf_ptr += mblen - 1; @@ -3467,7 +3487,7 @@ CopyAttributeOutText(CopyState cstate, char *string) char delimc = cstate->delim[0]; if (cstate->need_transcoding) - ptr = pg_server_to_client(string, strlen(string)); + ptr = pg_server_to_any(string, strlen(string), cstate->file_encoding); else ptr = string; @@ -3540,7 +3560,7 @@ CopyAttributeOutText(CopyState cstate, char *string) start = ptr++; /* we include char in next run */ } else if (IS_HIGHBIT_SET(c)) - ptr += pg_encoding_mblen(cstate->client_encoding, ptr); + ptr += pg_encoding_mblen(cstate->file_encoding, ptr); else ptr++; } @@ -3627,7 +3647,7 @@ CopyAttributeOutCSV(CopyState cstate, char *string, use_quote = true; if (cstate->need_transcoding) - ptr = pg_server_to_client(string, strlen(string)); + ptr = pg_server_to_any(string, strlen(string), cstate->file_encoding); else ptr = string; @@ -3654,7 +3674,7 @@ CopyAttributeOutCSV(CopyState cstate, char *string, break; } if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii) - tptr += pg_encoding_mblen(cstate->client_encoding, tptr); + tptr += pg_encoding_mblen(cstate->file_encoding, tptr); else tptr++; } @@ -3678,7 +3698,7 @@ CopyAttributeOutCSV(CopyState cstate, char *string, start = ptr; /* we include char in next run */ } if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii) - ptr += pg_encoding_mblen(cstate->client_encoding, ptr); + ptr += pg_encoding_mblen(cstate->file_encoding, ptr); else ptr++; } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index c6811a11bd1..cbfacec4495 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -2236,6 
+2236,10 @@ copy_opt_item: { $$ = makeDefElem("force_not_null", (Node *)$4); } + | ENCODING Sconst + { + $$ = makeDefElem("encoding", (Node *)makeString($2)); + } ; /* The following exist for backward compatibility with very old versions */ diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 5ee74f747d0..b8a2728e4f5 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -497,14 +497,25 @@ pg_encoding_max_length_sql(PG_FUNCTION_ARGS) char * pg_client_to_server(const char *s, int len) { + Assert(ClientEncoding); + + return pg_any_to_server(s, len, ClientEncoding->encoding); +} + +/* + * convert any encoding to server encoding. + */ +char * +pg_any_to_server(const char *s, int len, int encoding) +{ Assert(DatabaseEncoding); Assert(ClientEncoding); if (len <= 0) return (char *) s; - if (ClientEncoding->encoding == DatabaseEncoding->encoding || - ClientEncoding->encoding == PG_SQL_ASCII) + if (encoding == DatabaseEncoding->encoding || + encoding == PG_SQL_ASCII) { /* * No conversion is needed, but we must still validate the data. @@ -524,8 +535,8 @@ pg_client_to_server(const char *s, int len) * to the parser but we have no way to convert it. We compromise by * rejecting the data if it contains any non-ASCII characters. 
*/ - if (PG_VALID_BE_ENCODING(ClientEncoding->encoding)) - (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false); + if (PG_VALID_BE_ENCODING(encoding)) + (void) pg_verify_mbstr(encoding, s, len, false); else { int i; @@ -543,7 +554,11 @@ pg_client_to_server(const char *s, int len) return (char *) s; } - return perform_default_encoding_conversion(s, len, true); + if (ClientEncoding->encoding == encoding) + return perform_default_encoding_conversion(s, len, true); + else + return (char *) pg_do_encoding_conversion( + (unsigned char *) s, len, encoding, DatabaseEncoding->encoding); } /* @@ -552,18 +567,33 @@ pg_client_to_server(const char *s, int len) char * pg_server_to_client(const char *s, int len) { + Assert(ClientEncoding); + + return pg_server_to_any(s, len, ClientEncoding->encoding); +} + +/* + * convert server encoding to any encoding. + */ +char * +pg_server_to_any(const char *s, int len, int encoding) +{ Assert(DatabaseEncoding); Assert(ClientEncoding); if (len <= 0) return (char *) s; - if (ClientEncoding->encoding == DatabaseEncoding->encoding || - ClientEncoding->encoding == PG_SQL_ASCII || + if (encoding == DatabaseEncoding->encoding || + encoding == PG_SQL_ASCII || DatabaseEncoding->encoding == PG_SQL_ASCII) return (char *) s; /* assume data is valid */ - return perform_default_encoding_conversion(s, len, false); + if (ClientEncoding->encoding == encoding) + return perform_default_encoding_conversion(s, len, false); + else + return (char *) pg_do_encoding_conversion( + (unsigned char *) s, len, DatabaseEncoding->encoding, encoding); } /* diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 565b53b3e6e..85a7b2f87dd 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -420,6 +420,8 @@ extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len, extern char *pg_client_to_server(const char *s, int len); extern char *pg_server_to_client(const char *s, int len); +extern char 
*pg_any_to_server(const char *s, int len, int encoding); +extern char *pg_server_to_any(const char *s, int len, int encoding); extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc); extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc); diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out index 15cbe029770..8e2bc0c2504 100644 --- a/src/test/regress/expected/copy2.out +++ b/src/test/regress/expected/copy2.out @@ -46,10 +46,10 @@ CONTEXT: COPY x, line 1: "2001 231 \N \N" COPY x from stdin; ERROR: extra data after last expected column CONTEXT: COPY x, line 1: "2002 232 40 50 60 70 80" --- various COPY options: delimiters, oids, NULL string +-- various COPY options: delimiters, oids, NULL string, encoding COPY x (b, c, d, e) from stdin with oids delimiter ',' null 'x'; COPY x from stdin WITH DELIMITER AS ';' NULL AS ''; -COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X'; +COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X' ENCODING 'sql_ascii'; -- check results of copy in SELECT * FROM x; a | b | c | d | e @@ -187,7 +187,7 @@ COPY y TO stdout WITH CSV QUOTE '''' DELIMITER '|'; Jackson, Sam|\h It is "perfect".| ''| -COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\'; +COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\' ENCODING 'sql_ascii'; "Jackson, Sam","\\h" "It is \"perfect\"."," " "", diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql index c2e8b037e74..6322c8fba43 100644 --- a/src/test/regress/sql/copy2.sql +++ b/src/test/regress/sql/copy2.sql @@ -72,7 +72,7 @@ COPY x from stdin; 2002 232 40 50 60 70 80 \. --- various COPY options: delimiters, oids, NULL string +-- various COPY options: delimiters, oids, NULL string, encoding COPY x (b, c, d, e) from stdin with oids delimiter ',' null 'x'; 500000,x,45,80,90 500001,x,\x,\\x,\\\x @@ -83,7 +83,7 @@ COPY x from stdin WITH DELIMITER AS ';' NULL AS ''; 3000;;c;; \. 
-COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X'; +COPY x from stdin WITH DELIMITER AS ':' NULL AS E'\\X' ENCODING 'sql_ascii'; 4000:\X:C:\X:\X 4001:1:empty:: 4002:2:null:\X:\X @@ -127,7 +127,7 @@ INSERT INTO y VALUES ('', NULL); COPY y TO stdout WITH CSV; COPY y TO stdout WITH CSV QUOTE '''' DELIMITER '|'; -COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\'; +COPY y TO stdout WITH CSV FORCE QUOTE col2 ESCAPE E'\\' ENCODING 'sql_ascii'; COPY y TO stdout WITH CSV FORCE QUOTE *; -- Repeat above tests with new 9.0 option syntax |