1 files changed, 396 insertions, 126 deletions
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index ce24a1528bd..0813424768f 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -3,6 +3,50 @@
  * copyfromparse.c
  *		Parse CSV/text/binary format for COPY FROM.
  *
+ * This file contains routines to parse the text, CSV and binary input
+ * formats.  The main entry point is NextCopyFrom(), which parses the
+ * next input line and returns it as Datums.
+ *
+ * In text/CSV mode, the parsing happens in multiple stages:
+ *
+ * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
+ *                1.          2.            3.           4.
+ *
+ * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
+ *    places it into 'raw_buf'.
+ *
+ * 2. CopyConvertBuf() calls the encoding conversion function to convert
+ *    the data in 'raw_buf' from client to server encoding, placing the
+ *    converted result in 'input_buf'.
+ *
+ * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
+ *    It is responsible for finding the next newline marker, taking quote and
+ *    escape characters into account according to the COPY options.  The line
+ *    is copied into 'line_buf', with quotes and escape characters still
+ *    intact.
+ *
+ * 4. CopyReadAttributesText/CSV() function takes the input line from
+ *    'line_buf', and splits it into fields, unescaping the data as required.
+ *    The fields are stored in 'attribute_buf', and 'raw_fields' array holds
+ *    pointers to each field.
+ *
+ * If encoding conversion is not required, a shortcut is taken in step 2 to
+ * avoid copying the data unnecessarily.  The 'input_buf' pointer is set to
+ * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
+ * directly into 'input_buf'.  CopyConvertBuf() then merely validates that
+ * the data is valid in the current encoding.
+ *
+ * In binary mode, the pipeline is much simpler.  Input is loaded into
+ * into 'raw_buf', and encoding conversion is done in the datatype-specific
+ * receive functions, if required.  'input_buf' and 'line_buf' are not used,
+ * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
+ * data when it's passed the receive function.
+ *
+ * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE).  'input_buf' is also
+ * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required.  'line_buf'
+ * and 'attribute_buf' are expanded on demand, to hold the longest line
+ * encountered so far.
+ *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
@@ -35,7 +79,7 @@
 #define OCTVALUE(c) ((c) - '0')
 
 /*
- * These macros centralize code used to process line_buf and raw_buf buffers.
+ * These macros centralize code used to process line_buf and input_buf buffers.
  * They are macros because they often do continue/break control and to avoid
  * function call overhead in tight COPY loops.
  *
@@ -53,9 +97,9 @@
 #define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
 if (1) \
 { \
-	if (raw_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
+	if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
 	{ \
-		raw_buf_ptr = prev_raw_ptr; /* undo fetch */ \
+		input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
 		need_data = true; \
 		continue; \
 	} \
@@ -65,10 +109,10 @@ if (1) \
 #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
 if (1) \
 { \
-	if (raw_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
+	if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
 	{ \
 		if (extralen) \
-			raw_buf_ptr = copy_buf_len; /* consume the partial character */ \
+			input_buf_ptr = copy_buf_len; /* consume the partial character */ \
 		/* backslash just before EOF, treat as data char */ \
 		result = true; \
 		break; \
@@ -77,17 +121,17 @@ if (1) \
 
 /*
  * Transfer any approved data to line_buf; must do this to be sure
- * there is some room in raw_buf.
+ * there is some room in input_buf.
  */
 #define REFILL_LINEBUF \
 if (1) \
 { \
-	if (raw_buf_ptr > cstate->raw_buf_index) \
+	if (input_buf_ptr > cstate->input_buf_index) \
 	{ \
 		appendBinaryStringInfo(&cstate->line_buf, \
-							 cstate->raw_buf + cstate->raw_buf_index, \
-							   raw_buf_ptr - cstate->raw_buf_index); \
-		cstate->raw_buf_index = raw_buf_ptr; \
+							 cstate->input_buf + cstate->input_buf_index, \
+							   input_buf_ptr - cstate->input_buf_index); \
+		cstate->input_buf_index = input_buf_ptr; \
 	} \
 } else ((void) 0)
 
@@ -95,7 +139,7 @@ if (1) \
 #define NO_END_OF_COPY_GOTO \
 if (1) \
 { \
-	raw_buf_ptr = prev_raw_ptr + 1; \
+	input_buf_ptr = prev_raw_ptr + 1; \
 	goto not_end_of_copy; \
 } else ((void) 0)
 
@@ -118,7 +162,7 @@ static int	CopyGetData(CopyFromState cstate, void *databuf,
 						int minread, int maxread);
 static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
 static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
-static bool CopyLoadRawBuf(CopyFromState cstate);
+static void CopyLoadInputBuf(CopyFromState cstate);
 static int	CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
 
 void
@@ -210,10 +254,10 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
 						(errcode_for_file_access(),
 						 errmsg("could not read from COPY file: %m")));
 			if (bytesread == 0)
-				cstate->reached_eof = true;
+				cstate->raw_reached_eof = true;
 			break;
 		case COPY_FRONTEND:
-			while (maxread > 0 && bytesread < minread && !cstate->reached_eof)
+			while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
 			{
 				int			avail;
 
@@ -241,7 +285,7 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
 							break;
 						case 'c':	/* CopyDone */
 							/* COPY IN correctly terminated by frontend */
-							cstate->reached_eof = true;
+							cstate->raw_reached_eof = true;
 							return bytesread;
 						case 'f':	/* CopyFail */
 							ereport(ERROR,
@@ -327,34 +371,303 @@ CopyGetInt16(CopyFromState cstate, int16 *val)
 
 
 /*
- * CopyLoadRawBuf loads some more data into raw_buf
+ * Perform encoding conversion on data in 'raw_buf', writing the converted
+ * data into 'input_buf'.
  *
- * Returns true if able to obtain at least one more byte, else false.
+ * On entry, there must be some data to convert in 'raw_buf'.
+ */
+static void
+CopyConvertBuf(CopyFromState cstate)
+{
+	/*
+	 * If the file and server encoding are the same, no encoding conversion is
+	 * required.  However, we still need to verify that the input is valid for
+	 * the encoding.
+	 */
+	if (!cstate->need_transcoding)
+	{
+		/*
+		 * When conversion is not required, input_buf and raw_buf are the
+		 * same.  raw_buf_len is the total number of bytes in the buffer, and
+		 * input_buf_len tracks how many of those bytes have already been
+		 * verified.
+		 */
+		int			preverifiedlen = cstate->input_buf_len;
+		int			unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
+		int			nverified;
+
+		if (unverifiedlen == 0)
+		{
+			/*
+			 * If no more raw data is coming, report the EOF to the caller.
+			 */
+			if (cstate->raw_reached_eof)
+				cstate->input_reached_eof = true;
+			return;
+		}
+
+		/*
+		 * Verify the new data, including any residual unverified bytes from
+		 * previous round.
+		 */
+		nverified = pg_encoding_verifymbstr(cstate->file_encoding,
+											cstate->raw_buf + preverifiedlen,
+											unverifiedlen);
+		if (nverified == 0)
+		{
+			/*
+			 * Could not verify anything.
+			 *
+			 * If there is no more raw input data coming, it means that there
+			 * was an incomplete multi-byte sequence at the end.  Also, if
+			 * there's "enough" input left, we should be able to verify at
+			 * least one character, and a failure to do so means that we've
+			 * hit an invalid byte sequence.
+			 */
+			if (cstate->raw_reached_eof || unverifiedlen >= pg_database_encoding_max_length())
+				cstate->input_reached_error = true;
+			return;
+		}
+		cstate->input_buf_len += nverified;
+	}
+	else
+	{
+		/*
+		 * Encoding conversion is needed.
+		 */
+		int			nbytes;
+		unsigned char *src;
+		int			srclen;
+		unsigned char *dst;
+		int			dstlen;
+		int			convertedlen;
+
+		if (RAW_BUF_BYTES(cstate) == 0)
+		{
+			/*
+			 * If no more raw data is coming, report the EOF to the caller.
+			 */
+			if (cstate->raw_reached_eof)
+				cstate->input_reached_eof = true;
+			return;
+		}
+
+		/*
+		 * First, copy down any unprocessed data.
+		 */
+		nbytes = INPUT_BUF_BYTES(cstate);
+		if (nbytes > 0 && cstate->input_buf_index > 0)
+			memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
+					nbytes);
+		cstate->input_buf_index = 0;
+		cstate->input_buf_len = nbytes;
+		cstate->input_buf[nbytes] = '\0';
+
+		src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
+		srclen = cstate->raw_buf_len - cstate->raw_buf_index;
+		dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
+		dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
+
+		/*
+		 * Do the conversion.  This might stop short, if there is an invalid
+		 * byte sequence in the input.  We'll convert as much as we can in
+		 * that case.
+		 *
+		 * Note: Even if we hit an invalid byte sequence, we don't report the
+		 * error until all the valid bytes have been consumed.  The input
+		 * might contain an end-of-input marker (\.), and we don't want to
+		 * report an error if the invalid byte sequence is after the
+		 * end-of-input marker.  We might unnecessarily convert some data
+		 * after the end-of-input marker as long as it's valid for the
+		 * encoding, but that's harmless.
+		 */
+		convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
+													 cstate->file_encoding,
+													 GetDatabaseEncoding(),
+													 src, srclen,
+													 dst, dstlen,
+													 true);
+		if (convertedlen == 0)
+		{
+			/*
+			 * Could not convert anything.  If there is no more raw input data
+			 * coming, it means that there was an incomplete multi-byte
+			 * sequence at the end.  Also, if there is plenty of input left,
+			 * we should be able to convert at least one character, so a
+			 * failure to do so must mean that we've hit a byte sequence
+			 * that's invalid.
+			 */
+			if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
+				cstate->input_reached_error = true;
+			return;
+		}
+		cstate->raw_buf_index += convertedlen;
+		cstate->input_buf_len += strlen((char *) dst);
+	}
+}
+
+/*
+ * Report an encoding or conversion error.
+ */
+static void
+CopyConversionError(CopyFromState cstate)
+{
+	Assert(cstate->raw_buf_len > 0);
+	Assert(cstate->input_reached_error);
+
+	if (!cstate->need_transcoding)
+	{
+		/*
+		 * Everything up to input_buf_len was successfully verified, and
+		 * input_buf_len points to the invalid or incomplete character.
+		 */
+		report_invalid_encoding(cstate->file_encoding,
+								cstate->raw_buf + cstate->input_buf_len,
+								cstate->raw_buf_len - cstate->input_buf_len);
+	}
+	else
+	{
+		/*
+		 * raw_buf_index points to the invalid or untranslatable character. We
+		 * let the conversion routine report the error, because it can provide
+		 * a more specific error message than we could here.  An earlier call
+		 * to the conversion routine in CopyConvertBuf() detected that there
+		 * is an error, now we call the conversion routine again with
+		 * noError=false, to have it throw the error.
+		 */
+		unsigned char *src;
+		int			srclen;
+		unsigned char *dst;
+		int			dstlen;
+
+		src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
+		srclen = cstate->raw_buf_len - cstate->raw_buf_index;
+		dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
+		dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
+
+		(void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
+											 cstate->file_encoding,
+											 GetDatabaseEncoding(),
+											 src, srclen,
+											 dst, dstlen,
+											 false);
+
+		/*
+		 * The conversion routine should have reported an error, so this
+		 * should not be reached.
+		 */
+		elog(ERROR, "encoding conversion failed without error");
+	}
+}
+
+/*
+ * Load more data from data source to raw_buf.
  *
- * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
- * of the buffer and then we load more data after that.  This case occurs only
- * when a multibyte character crosses a bufferload boundary.
+ * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
+ * beginning of the buffer, and we load new data after that.
  */
-static bool
+static void
 CopyLoadRawBuf(CopyFromState cstate)
 {
-	int			nbytes = RAW_BUF_BYTES(cstate);
+	int			nbytes;
 	int			inbytes;
 
-	/* Copy down the unprocessed data if any. */
-	if (nbytes > 0)
+	/*
+	 * In text mode, if encoding conversion is not required, raw_buf and
+	 * input_buf point to the same buffer.  Their len/index better agree, too.
+	 */
+	if (cstate->raw_buf == cstate->input_buf)
+	{
+		Assert(!cstate->need_transcoding);
+		Assert(cstate->raw_buf_index == cstate->input_buf_index);
+		Assert(cstate->input_buf_len <= cstate->raw_buf_len);
+	}
+
+	/*
+	 * Copy down the unprocessed data if any.
+	 */
+	nbytes = RAW_BUF_BYTES(cstate);
+	if (nbytes > 0 && cstate->raw_buf_index > 0)
 		memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
 				nbytes);
+	cstate->raw_buf_len -= cstate->raw_buf_index;
+	cstate->raw_buf_index = 0;
+
+	/*
+	 * If raw_buf and input_buf are in fact the same buffer, adjust the
+	 * input_buf variables, too.
+	 */
+	if (cstate->raw_buf == cstate->input_buf)
+	{
+		cstate->input_buf_len -= cstate->input_buf_index;
+		cstate->input_buf_index = 0;
+	}
 
-	inbytes = CopyGetData(cstate, cstate->raw_buf + nbytes,
-						  1, RAW_BUF_SIZE - nbytes);
+	/* Load more data */
+	inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
+						  1, RAW_BUF_SIZE - cstate->raw_buf_len);
 	nbytes += inbytes;
 	cstate->raw_buf[nbytes] = '\0';
-	cstate->raw_buf_index = 0;
 	cstate->raw_buf_len = nbytes;
+
 	cstate->bytes_processed += inbytes;
 	pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
-	return (inbytes > 0);
+
+	if (inbytes == 0)
+		cstate->raw_reached_eof = true;
+}
+
+/*
+ * CopyLoadInputBuf loads some more data into input_buf
+ *
+ * On return, at least one more input character is loaded into
+ * input_buf, or input_reached_eof is set.
+ *
+ * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
+ * of the buffer and then we load more data after that.
+ */
+static void
+CopyLoadInputBuf(CopyFromState cstate)
+{
+	int			nbytes = INPUT_BUF_BYTES(cstate);
+
+	/*
+	 * The caller has updated input_buf_index to indicate how much of the
+	 * input has been consumed and isn't needed anymore.  If input_buf is the
+	 * same physical area as raw_buf, update raw_buf_index accordingly.
+	 */
+	if (cstate->raw_buf == cstate->input_buf)
+	{
+		Assert(!cstate->need_transcoding);
+		Assert(cstate->input_buf_index >= cstate->raw_buf_index);
+		cstate->raw_buf_index = cstate->input_buf_index;
+	}
+
+	for (;;)
+	{
+		/* If we now have some unconverted data, try to convert it */
+		CopyConvertBuf(cstate);
+
+		/* If we now have some more input bytes ready, return them */
+		if (INPUT_BUF_BYTES(cstate) > nbytes)
+			return;
+
+		/*
+		 * If we reached an invalid byte sequence, or we're at an incomplete
+		 * multi-byte character but there is no more raw input data, report
+		 * conversion error.
+		 */
+		if (cstate->input_reached_error)
+			CopyConversionError(cstate);
+
+		/* no more input, and everything has been converted */
+		if (cstate->input_reached_eof)
+			break;
+
+		/* Try to load more raw data */
+		Assert(!cstate->raw_reached_eof);
+		CopyLoadRawBuf(cstate);
+	}
 }
 
 /*
@@ -389,7 +702,8 @@ CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
 			/* Load more data if buffer is empty. */
 			if (RAW_BUF_BYTES(cstate) == 0)
 			{
-				if (!CopyLoadRawBuf(cstate))
+				CopyLoadRawBuf(cstate);
+				if (cstate->raw_reached_eof)
 					break;		/* EOF */
 			}
 
@@ -645,8 +959,7 @@ NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
 }
 
 /*
- * Read the next input line and stash it in line_buf, with conversion to
- * server encoding.
+ * Read the next input line and stash it in line_buf.
  *
  * Result is true if read was terminated by EOF, false if terminated
  * by newline.  The terminating newline or EOF marker is not included
@@ -658,10 +971,7 @@ CopyReadLine(CopyFromState cstate)
 	bool		result;
 
 	resetStringInfo(&cstate->line_buf);
-	cstate->line_buf_valid = true;
-
-	/* Mark that encoding conversion hasn't occurred yet */
-	cstate->line_buf_converted = false;
+	cstate->line_buf_valid = false;
 
 	/* Parse data and transfer into line_buf */
 	result = CopyReadLineText(cstate);
@@ -675,10 +985,17 @@ CopyReadLine(CopyFromState cstate)
 		 */
 		if (cstate->copy_src == COPY_FRONTEND)
 		{
+			int			inbytes;
+
 			do
 			{
-				cstate->raw_buf_index = cstate->raw_buf_len;
-			} while (CopyLoadRawBuf(cstate));
+				inbytes = CopyGetData(cstate, cstate->input_buf,
+									  1, INPUT_BUF_SIZE);
+			} while (inbytes > 0);
+			cstate->input_buf_index = 0;
+			cstate->input_buf_len = 0;
+			cstate->raw_buf_index = 0;
+			cstate->raw_buf_len = 0;
 		}
 	}
 	else
@@ -715,25 +1032,8 @@ CopyReadLine(CopyFromState cstate)
 		}
 	}
 
-	/* Done reading the line.  Convert it to server encoding. */
-	if (cstate->need_transcoding)
-	{
-		char	   *cvt;
-
-		cvt = pg_any_to_server(cstate->line_buf.data,
-							   cstate->line_buf.len,
-							   cstate->file_encoding);
-		if (cvt != cstate->line_buf.data)
-		{
-			/* transfer converted data back to line_buf */
-			resetStringInfo(&cstate->line_buf);
-			appendBinaryStringInfo(&cstate->line_buf, cvt, strlen(cvt));
-			pfree(cvt);
-		}
-	}
-
 	/* Now it's safe to use the buffer in error messages */
-	cstate->line_buf_converted = true;
+	cstate->line_buf_valid = true;
 
 	return result;
 }
@@ -744,13 +1044,12 @@ CopyReadLine(CopyFromState cstate)
 static bool
 CopyReadLineText(CopyFromState cstate)
 {
-	char	   *copy_raw_buf;
-	int			raw_buf_ptr;
+	char	   *copy_input_buf;
+	int			input_buf_ptr;
 	int			copy_buf_len;
 	bool		need_data = false;
 	bool		hit_eof = false;
 	bool		result = false;
-	char		mblen_str[2];
 
 	/* CSV variables */
 	bool		first_char_in_line = true;
@@ -768,8 +1067,6 @@ CopyReadLineText(CopyFromState cstate)
 			escapec = '\0';
 	}
 
-	mblen_str[1] = '\0';
-
 	/*
 	 * The objective of this loop is to transfer the entire next input line
 	 * into line_buf.  Hence, we only care for detecting newlines (\r and/or
@@ -782,18 +1079,25 @@ CopyReadLineText(CopyFromState cstate)
 	 * These four characters, and the CSV escape and quote characters, are
 	 * assumed the same in frontend and backend encodings.
 	 *
-	 * For speed, we try to move data from raw_buf to line_buf in chunks
-	 * rather than one character at a time.  raw_buf_ptr points to the next
-	 * character to examine; any characters from raw_buf_index to raw_buf_ptr
-	 * have been determined to be part of the line, but not yet transferred to
-	 * line_buf.
+	 * The input has already been converted to the database encoding.  All
+	 * supported server encodings have the property that all bytes in a
+	 * multi-byte sequence have the high bit set, so a multibyte character
+	 * cannot contain any newline or escape characters embedded in the
+	 * multibyte sequence.  Therefore, we can process the input byte-by-byte,
+	 * regardless of the encoding.
 	 *
-	 * For a little extra speed within the loop, we copy raw_buf and
-	 * raw_buf_len into local variables.
+	 * For speed, we try to move data from input_buf to line_buf in chunks
+	 * rather than one character at a time.  input_buf_ptr points to the next
+	 * character to examine; any characters from input_buf_index to
+	 * input_buf_ptr have been determined to be part of the line, but not yet
+	 * transferred to line_buf.
+	 *
+	 * For a little extra speed within the loop, we copy input_buf and
+	 * input_buf_len into local variables.
 	 */
-	copy_raw_buf = cstate->raw_buf;
-	raw_buf_ptr = cstate->raw_buf_index;
-	copy_buf_len = cstate->raw_buf_len;
+	copy_input_buf = cstate->input_buf;
+	input_buf_ptr = cstate->input_buf_index;
+	copy_buf_len = cstate->input_buf_len;
 
 	for (;;)
 	{
@@ -810,24 +1114,21 @@ CopyReadLineText(CopyFromState cstate)
 		 * cstate->copy_src != COPY_OLD_FE, but it hardly seems worth it,
 		 * considering the size of the buffer.
 		 */
-		if (raw_buf_ptr >= copy_buf_len || need_data)
+		if (input_buf_ptr >= copy_buf_len || need_data)
 		{
 			REFILL_LINEBUF;
 
-			/*
-			 * Try to read some more data.  This will certainly reset
-			 * raw_buf_index to zero, and raw_buf_ptr must go with it.
-			 */
-			if (!CopyLoadRawBuf(cstate))
-				hit_eof = true;
-			raw_buf_ptr = 0;
-			copy_buf_len = cstate->raw_buf_len;
+			CopyLoadInputBuf(cstate);
+			/* update our local variables */
+			hit_eof = cstate->input_reached_eof;
+			input_buf_ptr = cstate->input_buf_index;
+			copy_buf_len = cstate->input_buf_len;
 
 			/*
 			 * If we are completely out of data, break out of the loop,
 			 * reporting EOF.
 			 */
-			if (copy_buf_len <= 0)
+			if (INPUT_BUF_BYTES(cstate) <= 0)
 			{
 				result = true;
 				break;
@@ -836,8 +1137,8 @@ CopyReadLineText(CopyFromState cstate)
 		}
 
 		/* OK to fetch a character */
-		prev_raw_ptr = raw_buf_ptr;
-		c = copy_raw_buf[raw_buf_ptr++];
+		prev_raw_ptr = input_buf_ptr;
+		c = copy_input_buf[input_buf_ptr++];
 
 		if (cstate->opts.csv_mode)
 		{
@@ -891,16 +1192,16 @@ CopyReadLineText(CopyFromState cstate)
 				 * If need more data, go back to loop top to load it.
 				 *
 				 * Note that if we are at EOF, c will wind up as '\0' because
-				 * of the guaranteed pad of raw_buf.
+				 * of the guaranteed pad of input_buf.
 				 */
 				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 
 				/* get next char */
-				c = copy_raw_buf[raw_buf_ptr];
+				c = copy_input_buf[input_buf_ptr];
 
 				if (c == '\n')
 				{
-					raw_buf_ptr++;	/* eat newline */
+					input_buf_ptr++;	/* eat newline */
 					cstate->eol_type = EOL_CRNL;	/* in case not set yet */
 				}
 				else
@@ -967,14 +1268,14 @@ CopyReadLineText(CopyFromState cstate)
 			/* -----
 			 * get next character
 			 * Note: we do not change c so if it isn't \., we can fall
-			 * through and continue processing for file encoding.
+			 * through and continue processing.
 			 * -----
 			 */
-			c2 = copy_raw_buf[raw_buf_ptr];
+			c2 = copy_input_buf[input_buf_ptr];
 
 			if (c2 == '.')
 			{
-				raw_buf_ptr++;	/* consume the '.' */
+				input_buf_ptr++;	/* consume the '.' */
 
 				/*
 				 * Note: if we loop back for more data here, it does not
@@ -986,7 +1287,7 @@ CopyReadLineText(CopyFromState cstate)
 					/* Get the next character */
 					IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 					/* if hit_eof, c2 will become '\0' */
-					c2 = copy_raw_buf[raw_buf_ptr++];
+					c2 = copy_input_buf[input_buf_ptr++];
 
 					if (c2 == '\n')
 					{
@@ -1011,7 +1312,7 @@ CopyReadLineText(CopyFromState cstate)
 				/* Get the next character */
 				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 				/* if hit_eof, c2 will become '\0' */
-				c2 = copy_raw_buf[raw_buf_ptr++];
+				c2 = copy_input_buf[input_buf_ptr++];
 
 				if (c2 != '\r' && c2 != '\n')
 				{
@@ -1036,11 +1337,11 @@ CopyReadLineText(CopyFromState cstate)
 				 * Transfer only the data before the \. into line_buf, then
 				 * discard the data and the \. sequence.
 				 */
-				if (prev_raw_ptr > cstate->raw_buf_index)
+				if (prev_raw_ptr > cstate->input_buf_index)
 					appendBinaryStringInfo(&cstate->line_buf,
-										   cstate->raw_buf + cstate->raw_buf_index,
-										   prev_raw_ptr - cstate->raw_buf_index);
-				cstate->raw_buf_index = raw_buf_ptr;
+										   cstate->input_buf + cstate->input_buf_index,
+										   prev_raw_ptr - cstate->input_buf_index);
+				cstate->input_buf_index = input_buf_ptr;
 				result = true;	/* report EOF */
 				break;
 			}
@@ -1056,15 +1357,8 @@ CopyReadLineText(CopyFromState cstate)
 				 * backslashes are not special, so we want to process the
 				 * character after the backslash just like a normal character,
 				 * so we don't increment in those cases.
-				 *
-				 * Set 'c' to skip whole character correctly in multi-byte
-				 * encodings.  If we don't have the whole character in the
-				 * buffer yet, we might loop back to process it, after all,
-				 * but that's OK because multi-byte characters cannot have any
-				 * special meaning.
 				 */
-				raw_buf_ptr++;
-				c = c2;
+				input_buf_ptr++;
 			}
 		}
 
@@ -1075,30 +1369,6 @@ CopyReadLineText(CopyFromState cstate)
 		 * value, while in non-CSV mode, \. cannot be a data value.
 		 */
 not_end_of_copy:
-
-		/*
-		 * Process all bytes of a multi-byte character as a group.
-		 *
-		 * We only support multi-byte sequences where the first byte has the
-		 * high-bit set, so as an optimization we can avoid this block
-		 * entirely if it is not set.
-		 */
-		if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c))
-		{
-			int			mblen;
-
-			/*
-			 * It is enough to look at the first byte in all our encodings, to
-			 * get the length.  (GB18030 is a bit special, but still works for
-			 * our purposes; see comment in pg_gb18030_mblen())
-			 */
-			mblen_str[0] = c;
-			mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str);
-
-			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);
-			IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1);
-			raw_buf_ptr += mblen - 1;
-		}
 		first_char_in_line = false;
 	}							/* end of outer loop */