about | summary | refs | log | tree | commit | diff
path: root/src/include/commands/copyfrom_internal.h
diff options
context:
space:
mode:
author: Heikki Linnakangas <heikki.linnakangas@iki.fi>, 2021-04-01 12:23:40 +0300
committer: Heikki Linnakangas <heikki.linnakangas@iki.fi>, 2021-04-01 12:23:40 +0300
commit: f82de5c46bdf8cd65812a7b04c9509c218e1545d (patch)
tree: f9d687f0e1f50666a4a4cf8fbe366a2cd7e43d1c /src/include/commands/copyfrom_internal.h
parent: ea1b99a6619cd9dcfd46b82ac0d926b0b80e0ae9 (diff)
download: postgresql-f82de5c46bdf8cd65812a7b04c9509c218e1545d.tar.gz
download: postgresql-f82de5c46bdf8cd65812a7b04c9509c218e1545d.zip
Do COPY FROM encoding conversion/verification in larger chunks.

This gives a small performance gain, by reducing the number of calls to the conversion/verification function, and letting it work with larger inputs. Also, reorganizing the input pipeline makes it easier to parallelize the input parsing: after the input has been converted to the database encoding, the next stage of finding the newlines can be done in parallel, because there cannot be any newline chars "embedded" in multi-byte characters in the encodings that we support as server encodings.

This changes behavior in one corner case: if client and server encodings are the same single-byte encoding (e.g. latin1), previously the input would not be checked for zero bytes ('\0'). Any fields containing zero bytes would be truncated at the zero. But if encoding conversion was needed, the conversion routine would throw an error on the zero. After this commit, the input is always checked for zeros.

Reviewed-by: John Naylor
Discussion: https://www.postgresql.org/message-id/e7861509-3960-538a-9025-b75a61188e01%40iki.fi

Diffstat (limited to 'src/include/commands/copyfrom_internal.h')
-rw-r--r-- src/include/commands/copyfrom_internal.h | 62
1 file changed, 35 insertions, 27 deletions
diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h
index 705f5b615be..858af7a717b 100644
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -52,17 +52,6 @@ typedef enum CopyInsertMethod
/*
* This struct contains all the state variables used throughout a COPY FROM
* operation.
- *
- * Multi-byte encodings: all supported client-side encodings encode multi-byte
- * characters by having the first byte's high bit set. Subsequent bytes of the
- * character can have the high bit not set. When scanning data in such an
- * encoding to look for a match to a single-byte (ie ASCII) character, we must
- * use the full pg_encoding_mblen() machinery to skip over multibyte
- * characters, else we might find a false match to a trailing byte. In
- * supported server encodings, there is no possibility of a false match, and
- * it's faster to make useless comparisons to trailing bytes than it is to
- * invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is true
- * when we have to do it the hard way.
*/
typedef struct CopyFromStateData
{
@@ -70,13 +59,11 @@ typedef struct CopyFromStateData
CopySource copy_src; /* type of copy source */
FILE *copy_file; /* used if copy_src == COPY_FILE */
StringInfo fe_msgbuf; /* used if copy_src == COPY_NEW_FE */
- bool reached_eof; /* true if we read to end of copy data (not
- * all copy_src types maintain this) */
EolType eol_type; /* EOL type of input */
int file_encoding; /* file or remote side's character encoding */
bool need_transcoding; /* file encoding diff from server? */
- bool encoding_embeds_ascii; /* ASCII can be non-first byte? */
+ Oid conversion_proc; /* encoding conversion function */
/* parameters from the COPY command */
Relation rel; /* relation to copy from */
@@ -131,31 +118,52 @@ typedef struct CopyFromStateData
/*
* Similarly, line_buf holds the whole input line being processed. The
- * input cycle is first to read the whole line into line_buf, convert it
- * to server encoding there, and then extract the individual attribute
- * fields into attribute_buf. line_buf is preserved unmodified so that we
- * can display it in error messages if appropriate. (In binary mode,
- * line_buf is not used.)
+ * input cycle is first to read the whole line into line_buf, and then
+ * extract the individual attribute fields into attribute_buf. line_buf
+ * is preserved unmodified so that we can display it in error messages if
+ * appropriate. (In binary mode, line_buf is not used.)
*/
StringInfoData line_buf;
- bool line_buf_converted; /* converted to server encoding? */
bool line_buf_valid; /* contains the row being processed? */
/*
- * Finally, raw_buf holds raw data read from the data source (file or
- * client connection). In text mode, CopyReadLine parses this data
- * sufficiently to locate line boundaries, then transfers the data to
- * line_buf and converts it. In binary mode, CopyReadBinaryData fetches
- * appropriate amounts of data from this buffer. In both modes, we
- * guarantee that there is a \0 at raw_buf[raw_buf_len].
+ * input_buf holds input data, already converted to database encoding.
+ *
+ * In text mode, CopyReadLine parses this data sufficiently to locate
+ * line boundaries, then transfers the data to line_buf. We guarantee
+ * that there is a \0 at input_buf[input_buf_len] at all times. (In
+ * binary mode, input_buf is not used.)
+ *
+ * If encoding conversion is not required, input_buf is not a separate
+ * buffer but points directly to raw_buf. In that case, input_buf_len
+ * tracks the number of bytes that have been verified as valid in the
+ * database encoding, and raw_buf_len is the total number of bytes
+ * stored in the buffer.
+ */
+#define INPUT_BUF_SIZE 65536 /* we palloc INPUT_BUF_SIZE+1 bytes */
+ char *input_buf;
+ int input_buf_index; /* next byte to process */
+ int input_buf_len; /* total # of bytes stored */
+ bool input_reached_eof; /* true if we reached EOF */
+ bool input_reached_error; /* true if a conversion error happened */
+ /* Shorthand for number of unconsumed bytes available in input_buf */
+#define INPUT_BUF_BYTES(cstate) ((cstate)->input_buf_len - (cstate)->input_buf_index)
+
+ /*
+ * raw_buf holds raw input data read from the data source (file or client
+ * connection), not yet converted to the database encoding. Like with
+ * 'input_buf', we guarantee that there is a \0 at raw_buf[raw_buf_len].
*/
#define RAW_BUF_SIZE 65536 /* we palloc RAW_BUF_SIZE+1 bytes */
char *raw_buf;
int raw_buf_index; /* next byte to process */
int raw_buf_len; /* total # of bytes stored */
- uint64 bytes_processed;/* number of bytes processed so far */
+ bool raw_reached_eof; /* true if we reached EOF */
+
/* Shorthand for number of unconsumed bytes available in raw_buf */
#define RAW_BUF_BYTES(cstate) ((cstate)->raw_buf_len - (cstate)->raw_buf_index)
+
+ uint64 bytes_processed; /* number of bytes processed so far */
} CopyFromStateData;
extern void ReceiveCopyBegin(CopyFromState cstate);