Do COPY FROM encoding conversion/verification in larger chunks.

This gives a small performance gain, by reducing the number of calls to the conversion/verification function, and letting it work with larger inputs. Also, reorganizing the input pipeline makes it easier to parallelize the input parsing: after the input has been converted to the database encoding, the next stage of finding the newlines can be done in parallel, because there cannot be any newline chars "embedded" in multi-byte characters in the encodings that we support as server encodings. This changes behavior in one corner case: if client and server encodings are the same single-byte encoding (e.g. latin1), previously the input would not be checked for zero bytes ('\0'). Any fields containing zero bytes would be truncated at the zero. But if encoding conversion was needed, the conversion routine would throw an error on the zero. After this commit, the input is always checked for zeros. Reviewed-by: John Naylor Discussion: https://www.postgresql.org/message-id/e7861509-3960-538a-9025-b75a61188e01%40iki.fi
author: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2021-04-01 12:23:40 +0300
committer: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2021-04-01 12:23:40 +0300
commit: f82de5c46bdf8cd65812a7b04c9509c218e1545d (patch)
tree: f9d687f0e1f50666a4a4cf8fbe366a2cd7e43d1c /src/backend/commands/copyfromparse.c
parent: ea1b99a6619cd9dcfd46b82ac0d926b0b80e0ae9 (diff)
download: postgresql-f82de5c46bdf8cd65812a7b04c9509c218e1545d.tar.gz
postgresql-f82de5c46bdf8cd65812a7b04c9509c218e1545d.zip
1 files changed, 396 insertions, 126 deletions
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index ce24a1528bd..0813424768f 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -3,6 +3,50 @@
  * copyfromparse.c
  *		Parse CSV/text/binary format for COPY FROM.
  *
+ * This file contains routines to parse the text, CSV and binary input
+ * formats.  The main entry point is NextCopyFrom(), which parses the
+ * next input line and returns it as Datums.
+ *
+ * In text/CSV mode, the parsing happens in multiple stages:
+ *
+ * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
+ *                1.          2.            3.           4.
+ *
+ * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
+ *    places it into 'raw_buf'.
+ *
+ * 2. CopyConvertBuf() calls the encoding conversion function to convert
+ *    the data in 'raw_buf' from client to server encoding, placing the
+ *    converted result in 'input_buf'.
+ *
+ * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
+ *    It is responsible for finding the next newline marker, taking quote and
+ *    escape characters into account according to the COPY options.  The line
+ *    is copied into 'line_buf', with quotes and escape characters still
+ *    intact.
+ *
+ * 4. CopyReadAttributesText/CSV() function takes the input line from
+ *    'line_buf', and splits it into fields, unescaping the data as required.
+ *    The fields are stored in 'attribute_buf', and 'raw_fields' array holds
+ *    pointers to each field.
+ *
+ * If encoding conversion is not required, a shortcut is taken in step 2 to
+ * avoid copying the data unnecessarily.  The 'input_buf' pointer is set to
+ * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
+ * directly into 'input_buf'.  CopyConvertBuf() then merely validates that
+ * the data is valid in the current encoding.
+ *
+ * In binary mode, the pipeline is much simpler.  Input is loaded into
+ * into 'raw_buf', and encoding conversion is done in the datatype-specific
+ * receive functions, if required.  'input_buf' and 'line_buf' are not used,
+ * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
+ * data when it's passed the receive function.
+ *
+ * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE).  'input_buf' is also
+ * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required.  'line_buf'
+ * and 'attribute_buf' are expanded on demand, to hold the longest line
+ * encountered so far.
+ *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
@@ -35,7 +79,7 @@
 #define OCTVALUE(c) ((c) - '0')
 
 /*
- * These macros centralize code used to process line_buf and raw_buf buffers.
+ * These macros centralize code used to process line_buf and input_buf buffers.
  * They are macros because they often do continue/break control and to avoid
  * function call overhead in tight COPY loops.
  *
@@ -53,9 +97,9 @@
 #define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
 if (1) \
 { \
-	if (raw_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
+	if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
 	{ \
-		raw_buf_ptr = prev_raw_ptr; /* undo fetch */ \
+		input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
 		need_data = true; \
 		continue; \
 	} \
@@ -65,10 +109,10 @@ if (1) \
 #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
 if (1) \
 { \
-	if (raw_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
+	if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
 	{ \
 		if (extralen) \
-			raw_buf_ptr = copy_buf_len; /* consume the partial character */ \
+			input_buf_ptr = copy_buf_len; /* consume the partial character */ \
 		/* backslash just before EOF, treat as data char */ \
 		result = true; \
 		break; \
@@ -77,17 +121,17 @@ if (1) \
 
 /*
  * Transfer any approved data to line_buf; must do this to be sure
- * there is some room in raw_buf.
+ * there is some room in input_buf.
  */
 #define REFILL_LINEBUF \
 if (1) \
 { \
-	if (raw_buf_ptr > cstate->raw_buf_index) \
+	if (input_buf_ptr > cstate->input_buf_index) \
 	{ \
 		appendBinaryStringInfo(&cstate->line_buf, \
-							 cstate->raw_buf + cstate->raw_buf_index, \
-							   raw_buf_ptr - cstate->raw_buf_index); \
-		cstate->raw_buf_index = raw_buf_ptr; \
+							 cstate->input_buf + cstate->input_buf_index, \
+							   input_buf_ptr - cstate->input_buf_index); \
+		cstate->input_buf_index = input_buf_ptr; \
 	} \
 } else ((void) 0)
 
@@ -95,7 +139,7 @@ if (1) \
 #define NO_END_OF_COPY_GOTO \
 if (1) \
 { \
-	raw_buf_ptr = prev_raw_ptr + 1; \
+	input_buf_ptr = prev_raw_ptr + 1; \
 	goto not_end_of_copy; \
 } else ((void) 0)
 
@@ -118,7 +162,7 @@ static int	CopyGetData(CopyFromState cstate, void *databuf,
 						int minread, int maxread);
 static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
 static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
-static bool CopyLoadRawBuf(CopyFromState cstate);
+static void CopyLoadInputBuf(CopyFromState cstate);
 static int	CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
 
 void
@@ -210,10 +254,10 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
 						(errcode_for_file_access(),
 						 errmsg("could not read from COPY file: %m")));
 			if (bytesread == 0)
-				cstate->reached_eof = true;
+				cstate->raw_reached_eof = true;
 			break;
 		case COPY_FRONTEND:
-			while (maxread > 0 && bytesread < minread && !cstate->reached_eof)
+			while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
 			{
 				int			avail;
 
@@ -241,7 +285,7 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
 							break;
 						case 'c':	/* CopyDone */
 							/* COPY IN correctly terminated by frontend */
-							cstate->reached_eof = true;
+							cstate->raw_reached_eof = true;
 							return bytesread;
 						case 'f':	/* CopyFail */
 							ereport(ERROR,
@@ -327,34 +371,303 @@ CopyGetInt16(CopyFromState cstate, int16 *val)
 
 
 /*
- * CopyLoadRawBuf loads some more data into raw_buf
+ * Perform encoding conversion on data in 'raw_buf', writing the converted
+ * data into 'input_buf'.
  *
- * Returns true if able to obtain at least one more byte, else false.
+ * On entry, there must be some data to convert in 'raw_buf'.
+ */
+static void
+CopyConvertBuf(CopyFromState cstate)
+{
+	/*
+	 * If the file and server encoding are the same, no encoding conversion is
+	 * required.  However, we still need to verify that the input is valid for
+	 * the encoding.
+	 */
+	if (!cstate->need_transcoding)
+	{
+		/*
+		 * When conversion is not required, input_buf and raw_buf are the
+		 * same.  raw_buf_len is the total number of bytes in the buffer, and
+		 * input_buf_len tracks how many of those bytes have already been
+		 * verified.
+		 */
+		int			preverifiedlen = cstate->input_buf_len;
+		int			unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
+		int			nverified;
+
+		if (unverifiedlen == 0)
+		{
+			/*
+			 * If no more raw data is coming, report the EOF to the caller.
+			 */
+			if (cstate->raw_reached_eof)
+				cstate->input_reached_eof = true;
+			return;
+		}
+
+		/*
+		 * Verify the new data, including any residual unverified bytes from
+		 * previous round.
+		 */
+		nverified = pg_encoding_verifymbstr(cstate->file_encoding,
+											cstate->raw_buf + preverifiedlen,
+											unverifiedlen);
+		if (nverified == 0)
+		{
+			/*
+			 * Could not verify anything.
+			 *
+			 * If there is no more raw input data coming, it means that there
+			 * was an incomplete multi-byte sequence at the end.  Also, if
+			 * there's "enough" input left, we should be able to verify at
+			 * least one character, and a failure to do so means that we've
+			 * hit an invalid byte sequence.
+			 */
+			if (cstate->raw_reached_eof || unverifiedlen >= pg_database_encoding_max_length())
+				cstate->input_reached_error = true;
+			return;
+		}
+		cstate->input_buf_len += nverified;
+	}
+	else
+	{
+		/*
+		 * Encoding conversion is needed.
+		 */
+		int			nbytes;
+		unsigned char *src;
+		int			srclen;
+		unsigned char *dst;
+		int			dstlen;
+		int			convertedlen;
+
+		if (RAW_BUF_BYTES(cstate) == 0)
+		{
+			/*
+			 * If no more raw data is coming, report the EOF to the caller.
+			 */
+			if (cstate->raw_reached_eof)
+				cstate->input_reached_eof = true;
+			return;
+		}
+
+		/*
+		 * First, copy down any unprocessed data.
+		 */
+		nbytes = INPUT_BUF_BYTES(cstate);
+		if (nbytes > 0 && cstate->input_buf_index > 0)
+			memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
+					nbytes);
+		cstate->input_buf_index = 0;
+		cstate->input_buf_len = nbytes;
+		cstate->input_buf[nbytes] = '\0';
+
+		src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
+		srclen = cstate->raw_buf_len - cstate->raw_buf_index;
+		dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
+		dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
+
+		/*
+		 * Do the conversion.  This might stop short, if there is an invalid
+		 * byte sequence in the input.  We'll convert as much as we can in
+		 * that case.
+		 *
+		 * Note: Even if we hit an invalid byte sequence, we don't report the
+		 * error until all the valid bytes have been consumed.  The input
+		 * might contain an end-of-input marker (\.), and we don't want to
+		 * report an error if the invalid byte sequence is after the
+		 * end-of-input marker.  We might unnecessarily convert some data
+		 * after the end-of-input marker as long as it's valid for the
+		 * encoding, but that's harmless.
+		 */
+		convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
+													 cstate->file_encoding,
+													 GetDatabaseEncoding(),
+													 src, srclen,
+													 dst, dstlen,
+													 true);
+		if (convertedlen == 0)
+		{
+			/*
+			 * Could not convert anything.  If there is no more raw input data
+			 * coming, it means that there was an incomplete multi-byte
+			 * sequence at the end.  Also, if there is plenty of input left,
+			 * we should be able to convert at least one character, so a
+			 * failure to do so must mean that we've hit a byte sequence
+			 * that's invalid.
+			 */
+			if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
+				cstate->input_reached_error = true;
+			return;
+		}
+		cstate->raw_buf_index += convertedlen;
+		cstate->input_buf_len += strlen((char *) dst);
+	}
+}
+
+/*
+ * Report an encoding or conversion error.
+ */
+static void
+CopyConversionError(CopyFromState cstate)
+{
+	Assert(cstate->raw_buf_len > 0);
+	Assert(cstate->input_reached_error);
+
+	if (!cstate->need_transcoding)
+	{
+		/*
+		 * Everything up to input_buf_len was successfully verified, and
+		 * input_buf_len points to the invalid or incomplete character.
+		 */
+		report_invalid_encoding(cstate->file_encoding,
+								cstate->raw_buf + cstate->input_buf_len,
+								cstate->raw_buf_len - cstate->input_buf_len);
+	}
+	else
+	{
+		/*
+		 * raw_buf_index points to the invalid or untranslatable character. We
+		 * let the conversion routine report the error, because it can provide
+		 * a more specific error message than we could here.  An earlier call
+		 * to the conversion routine in CopyConvertBuf() detected that there
+		 * is an error, now we call the conversion routine again with
+		 * noError=false, to have it throw the error.
+		 */
+		unsigned char *src;
+		int			srclen;
+		unsigned char *dst;
+		int			dstlen;
+
+		src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
+		srclen = cstate->raw_buf_len - cstate->raw_buf_index;
+		dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
+		dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
+
+		(void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
+											 cstate->file_encoding,
+											 GetDatabaseEncoding(),
+											 src, srclen,
+											 dst, dstlen,
+											 false);
+
+		/*
+		 * The conversion routine should have reported an error, so this
+		 * should not be reached.
+		 */
+		elog(ERROR, "encoding conversion failed without error");
+	}
+}
+
+/*
+ * Load more data from data source to raw_buf.
  *
- * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
- * of the buffer and then we load more data after that.  This case occurs only
- * when a multibyte character crosses a bufferload boundary.
+ * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
+ * beginning of the buffer, and we load new data after that.
  */
-static bool
+static void
 CopyLoadRawBuf(CopyFromState cstate)
 {
-	int			nbytes = RAW_BUF_BYTES(cstate);
+	int			nbytes;
 	int			inbytes;
 
-	/* Copy down the unprocessed data if any. */
-	if (nbytes > 0)
+	/*
+	 * In text mode, if encoding conversion is not required, raw_buf and
+	 * input_buf point to the same buffer.  Their len/index better agree, too.
+	 */
+	if (cstate->raw_buf == cstate->input_buf)
+	{
+		Assert(!cstate->need_transcoding);
+		Assert(cstate->raw_buf_index == cstate->input_buf_index);
+		Assert(cstate->input_buf_len <= cstate->raw_buf_len);
+	}
+
+	/*
+	 * Copy down the unprocessed data if any.
+	 */
+	nbytes = RAW_BUF_BYTES(cstate);
+	if (nbytes > 0 && cstate->raw_buf_index > 0)
 		memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
 				nbytes);
+	cstate->raw_buf_len -= cstate->raw_buf_index;
+	cstate->raw_buf_index = 0;
+
+	/*
+	 * If raw_buf and input_buf are in fact the same buffer, adjust the
+	 * input_buf variables, too.
+	 */
+	if (cstate->raw_buf == cstate->input_buf)
+	{
+		cstate->input_buf_len -= cstate->input_buf_index;
+		cstate->input_buf_index = 0;
+	}
 
-	inbytes = CopyGetData(cstate, cstate->raw_buf + nbytes,
-						  1, RAW_BUF_SIZE - nbytes);
+	/* Load more data */
+	inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
+						  1, RAW_BUF_SIZE - cstate->raw_buf_len);
 	nbytes += inbytes;
 	cstate->raw_buf[nbytes] = '\0';
-	cstate->raw_buf_index = 0;
 	cstate->raw_buf_len = nbytes;
+
 	cstate->bytes_processed += inbytes;
 	pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
-	return (inbytes > 0);
+
+	if (inbytes == 0)
+		cstate->raw_reached_eof = true;
+}
+
+/*
+ * CopyLoadInputBuf loads some more data into input_buf
+ *
+ * On return, at least one more input character is loaded into
+ * input_buf, or input_reached_eof is set.
+ *
+ * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
+ * of the buffer and then we load more data after that.
+ */
+static void
+CopyLoadInputBuf(CopyFromState cstate)
+{
+	int			nbytes = INPUT_BUF_BYTES(cstate);
+
+	/*
+	 * The caller has updated input_buf_index to indicate how much of the
+	 * input has been consumed and isn't needed anymore.  If input_buf is the
+	 * same physical area as raw_buf, update raw_buf_index accordingly.
+	 */
+	if (cstate->raw_buf == cstate->input_buf)
+	{
+		Assert(!cstate->need_transcoding);
+		Assert(cstate->input_buf_index >= cstate->raw_buf_index);
+		cstate->raw_buf_index = cstate->input_buf_index;
+	}
+
+	for (;;)
+	{
+		/* If we now have some unconverted data, try to convert it */
+		CopyConvertBuf(cstate);
+
+		/* If we now have some more input bytes ready, return them */
+		if (INPUT_BUF_BYTES(cstate) > nbytes)
+			return;
+
+		/*
+		 * If we reached an invalid byte sequence, or we're at an incomplete
+		 * multi-byte character but there is no more raw input data, report
+		 * conversion error.
+		 */
+		if (cstate->input_reached_error)
+			CopyConversionError(cstate);
+
+		/* no more input, and everything has been converted */
+		if (cstate->input_reached_eof)
+			break;
+
+		/* Try to load more raw data */
+		Assert(!cstate->raw_reached_eof);
+		CopyLoadRawBuf(cstate);
+	}
 }
 
 /*
@@ -389,7 +702,8 @@ CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
 			/* Load more data if buffer is empty. */
 			if (RAW_BUF_BYTES(cstate) == 0)
 			{
-				if (!CopyLoadRawBuf(cstate))
+				CopyLoadRawBuf(cstate);
+				if (cstate->raw_reached_eof)
 					break;		/* EOF */
 			}
 
@@ -645,8 +959,7 @@ NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
 }
 
 /*
- * Read the next input line and stash it in line_buf, with conversion to
- * server encoding.
+ * Read the next input line and stash it in line_buf.
  *
  * Result is true if read was terminated by EOF, false if terminated
  * by newline.  The terminating newline or EOF marker is not included
@@ -658,10 +971,7 @@ CopyReadLine(CopyFromState cstate)
 	bool		result;
 
 	resetStringInfo(&cstate->line_buf);
-	cstate->line_buf_valid = true;
-
-	/* Mark that encoding conversion hasn't occurred yet */
-	cstate->line_buf_converted = false;
+	cstate->line_buf_valid = false;
 
 	/* Parse data and transfer into line_buf */
 	result = CopyReadLineText(cstate);
@@ -675,10 +985,17 @@ CopyReadLine(CopyFromState cstate)
 		 */
 		if (cstate->copy_src == COPY_FRONTEND)
 		{
+			int			inbytes;
+
 			do
 			{
-				cstate->raw_buf_index = cstate->raw_buf_len;
-			} while (CopyLoadRawBuf(cstate));
+				inbytes = CopyGetData(cstate, cstate->input_buf,
+									  1, INPUT_BUF_SIZE);
+			} while (inbytes > 0);
+			cstate->input_buf_index = 0;
+			cstate->input_buf_len = 0;
+			cstate->raw_buf_index = 0;
+			cstate->raw_buf_len = 0;
 		}
 	}
 	else
@@ -715,25 +1032,8 @@ CopyReadLine(CopyFromState cstate)
 		}
 	}
 
-	/* Done reading the line.  Convert it to server encoding. */
-	if (cstate->need_transcoding)
-	{
-		char	   *cvt;
-
-		cvt = pg_any_to_server(cstate->line_buf.data,
-							   cstate->line_buf.len,
-							   cstate->file_encoding);
-		if (cvt != cstate->line_buf.data)
-		{
-			/* transfer converted data back to line_buf */
-			resetStringInfo(&cstate->line_buf);
-			appendBinaryStringInfo(&cstate->line_buf, cvt, strlen(cvt));
-			pfree(cvt);
-		}
-	}
-
 	/* Now it's safe to use the buffer in error messages */
-	cstate->line_buf_converted = true;
+	cstate->line_buf_valid = true;
 
 	return result;
 }
@@ -744,13 +1044,12 @@ CopyReadLine(CopyFromState cstate)
 static bool
 CopyReadLineText(CopyFromState cstate)
 {
-	char	   *copy_raw_buf;
-	int			raw_buf_ptr;
+	char	   *copy_input_buf;
+	int			input_buf_ptr;
 	int			copy_buf_len;
 	bool		need_data = false;
 	bool		hit_eof = false;
 	bool		result = false;
-	char		mblen_str[2];
 
 	/* CSV variables */
 	bool		first_char_in_line = true;
@@ -768,8 +1067,6 @@ CopyReadLineText(CopyFromState cstate)
 			escapec = '\0';
 	}
 
-	mblen_str[1] = '\0';
-
 	/*
 	 * The objective of this loop is to transfer the entire next input line
 	 * into line_buf.  Hence, we only care for detecting newlines (\r and/or
@@ -782,18 +1079,25 @@ CopyReadLineText(CopyFromState cstate)
 	 * These four characters, and the CSV escape and quote characters, are
 	 * assumed the same in frontend and backend encodings.
 	 *
-	 * For speed, we try to move data from raw_buf to line_buf in chunks
-	 * rather than one character at a time.  raw_buf_ptr points to the next
-	 * character to examine; any characters from raw_buf_index to raw_buf_ptr
-	 * have been determined to be part of the line, but not yet transferred to
-	 * line_buf.
+	 * The input has already been converted to the database encoding.  All
+	 * supported server encodings have the property that all bytes in a
+	 * multi-byte sequence have the high bit set, so a multibyte character
+	 * cannot contain any newline or escape characters embedded in the
+	 * multibyte sequence.  Therefore, we can process the input byte-by-byte,
+	 * regardless of the encoding.
 	 *
-	 * For a little extra speed within the loop, we copy raw_buf and
-	 * raw_buf_len into local variables.
+	 * For speed, we try to move data from input_buf to line_buf in chunks
+	 * rather than one character at a time.  input_buf_ptr points to the next
+	 * character to examine; any characters from input_buf_index to
+	 * input_buf_ptr have been determined to be part of the line, but not yet
+	 * transferred to line_buf.
+	 *
+	 * For a little extra speed within the loop, we copy input_buf and
+	 * input_buf_len into local variables.
 	 */
-	copy_raw_buf = cstate->raw_buf;
-	raw_buf_ptr = cstate->raw_buf_index;
-	copy_buf_len = cstate->raw_buf_len;
+	copy_input_buf = cstate->input_buf;
+	input_buf_ptr = cstate->input_buf_index;
+	copy_buf_len = cstate->input_buf_len;
 
 	for (;;)
 	{
@@ -810,24 +1114,21 @@ CopyReadLineText(CopyFromState cstate)
 		 * cstate->copy_src != COPY_OLD_FE, but it hardly seems worth it,
 		 * considering the size of the buffer.
 		 */
-		if (raw_buf_ptr >= copy_buf_len || need_data)
+		if (input_buf_ptr >= copy_buf_len || need_data)
 		{
 			REFILL_LINEBUF;
 
-			/*
-			 * Try to read some more data.  This will certainly reset
-			 * raw_buf_index to zero, and raw_buf_ptr must go with it.
-			 */
-			if (!CopyLoadRawBuf(cstate))
-				hit_eof = true;
-			raw_buf_ptr = 0;
-			copy_buf_len = cstate->raw_buf_len;
+			CopyLoadInputBuf(cstate);
+			/* update our local variables */
+			hit_eof = cstate->input_reached_eof;
+			input_buf_ptr = cstate->input_buf_index;
+			copy_buf_len = cstate->input_buf_len;
 
 			/*
 			 * If we are completely out of data, break out of the loop,
 			 * reporting EOF.
 			 */
-			if (copy_buf_len <= 0)
+			if (INPUT_BUF_BYTES(cstate) <= 0)
 			{
 				result = true;
 				break;
@@ -836,8 +1137,8 @@ CopyReadLineText(CopyFromState cstate)
 		}
 
 		/* OK to fetch a character */
-		prev_raw_ptr = raw_buf_ptr;
-		c = copy_raw_buf[raw_buf_ptr++];
+		prev_raw_ptr = input_buf_ptr;
+		c = copy_input_buf[input_buf_ptr++];
 
 		if (cstate->opts.csv_mode)
 		{
@@ -891,16 +1192,16 @@ CopyReadLineText(CopyFromState cstate)
 				 * If need more data, go back to loop top to load it.
 				 *
 				 * Note that if we are at EOF, c will wind up as '\0' because
-				 * of the guaranteed pad of raw_buf.
+				 * of the guaranteed pad of input_buf.
 				 */
 				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 
 				/* get next char */
-				c = copy_raw_buf[raw_buf_ptr];
+				c = copy_input_buf[input_buf_ptr];
 
 				if (c == '\n')
 				{
-					raw_buf_ptr++;	/* eat newline */
+					input_buf_ptr++;	/* eat newline */
 					cstate->eol_type = EOL_CRNL;	/* in case not set yet */
 				}
 				else
@@ -967,14 +1268,14 @@ CopyReadLineText(CopyFromState cstate)
 			/* -----
 			 * get next character
 			 * Note: we do not change c so if it isn't \., we can fall
-			 * through and continue processing for file encoding.
+			 * through and continue processing.
 			 * -----
 			 */
-			c2 = copy_raw_buf[raw_buf_ptr];
+			c2 = copy_input_buf[input_buf_ptr];
 
 			if (c2 == '.')
 			{
-				raw_buf_ptr++;	/* consume the '.' */
+				input_buf_ptr++;	/* consume the '.' */
 
 				/*
 				 * Note: if we loop back for more data here, it does not
@@ -986,7 +1287,7 @@ CopyReadLineText(CopyFromState cstate)
 					/* Get the next character */
 					IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 					/* if hit_eof, c2 will become '\0' */
-					c2 = copy_raw_buf[raw_buf_ptr++];
+					c2 = copy_input_buf[input_buf_ptr++];
 
 					if (c2 == '\n')
 					{
@@ -1011,7 +1312,7 @@ CopyReadLineText(CopyFromState cstate)
 				/* Get the next character */
 				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 				/* if hit_eof, c2 will become '\0' */
-				c2 = copy_raw_buf[raw_buf_ptr++];
+				c2 = copy_input_buf[input_buf_ptr++];
 
 				if (c2 != '\r' && c2 != '\n')
 				{
@@ -1036,11 +1337,11 @@ CopyReadLineText(CopyFromState cstate)
 				 * Transfer only the data before the \. into line_buf, then
 				 * discard the data and the \. sequence.
 				 */
-				if (prev_raw_ptr > cstate->raw_buf_index)
+				if (prev_raw_ptr > cstate->input_buf_index)
 					appendBinaryStringInfo(&cstate->line_buf,
-										   cstate->raw_buf + cstate->raw_buf_index,
-										   prev_raw_ptr - cstate->raw_buf_index);
-				cstate->raw_buf_index = raw_buf_ptr;
+										   cstate->input_buf + cstate->input_buf_index,
+										   prev_raw_ptr - cstate->input_buf_index);
+				cstate->input_buf_index = input_buf_ptr;
 				result = true;	/* report EOF */
 				break;
 			}
@@ -1056,15 +1357,8 @@ CopyReadLineText(CopyFromState cstate)
 				 * backslashes are not special, so we want to process the
 				 * character after the backslash just like a normal character,
 				 * so we don't increment in those cases.
-				 *
-				 * Set 'c' to skip whole character correctly in multi-byte
-				 * encodings.  If we don't have the whole character in the
-				 * buffer yet, we might loop back to process it, after all,
-				 * but that's OK because multi-byte characters cannot have any
-				 * special meaning.
 				 */
-				raw_buf_ptr++;
-				c = c2;
+				input_buf_ptr++;
 			}
 		}
 
@@ -1075,30 +1369,6 @@ CopyReadLineText(CopyFromState cstate)
 		 * value, while in non-CSV mode, \. cannot be a data value.
 		 */
 not_end_of_copy:
-
-		/*
-		 * Process all bytes of a multi-byte character as a group.
-		 *
-		 * We only support multi-byte sequences where the first byte has the
-		 * high-bit set, so as an optimization we can avoid this block
-		 * entirely if it is not set.
-		 */
-		if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c))
-		{
-			int			mblen;
-
-			/*
-			 * It is enough to look at the first byte in all our encodings, to
-			 * get the length.  (GB18030 is a bit special, but still works for
-			 * our purposes; see comment in pg_gb18030_mblen())
-			 */
-			mblen_str[0] = c;
-			mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str);
-
-			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);
-			IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1);
-			raw_buf_ptr += mblen - 1;
-		}
 		first_char_in_line = false;
 	}							/* end of outer loop */
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>	2021-04-01 12:23:40 +0300
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>	2021-04-01 12:23:40 +0300
commit	f82de5c46bdf8cd65812a7b04c9509c218e1545d (patch)
tree	f9d687f0e1f50666a4a4cf8fbe366a2cd7e43d1c /src/backend/commands/copyfromparse.c
parent	ea1b99a6619cd9dcfd46b82ac0d926b0b80e0ae9 (diff)
download	postgresql-f82de5c46bdf8cd65812a7b04c9509c218e1545d.tar.gz postgresql-f82de5c46bdf8cd65812a7b04c9509c218e1545d.zip