Reduce memory usage of tsvector type analyze function.

compute_tsvector_stats() detoasted and kept in memory every tsvector value in the sample, which can consume a lot of memory. The original bug report described a case using over 10 gigabytes, with a statistics target of 10000 (the maximum). To fix, allocate a separate copy of just the lexemes that we keep around, and free the detoasted tsvector values as we go. This adds some palloc/pfree overhead when there are a lot of distinct lexemes in the sample, but that is better than running out of memory. Fixes bug #14654 reported by James C. Reviewed by Tom Lane. Backport to all supported versions. Discussion: https://www.postgresql.org/message-id/20170514200602.1451.46797@wrigleys.postgresql.org
author: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2017-07-12 22:03:38 +0300
committer: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2017-07-12 22:06:13 +0300
commit: da11977de9c685ef808d3a293727f9ce26753ec4 (patch)
tree: eea53c3ea5369db5ebb489e3133f7c234c102b62 /src
parent: ca793c59a51e94cedf8cbea5c29668bf8fa298f3 (diff)
download: postgresql-da11977de9c685ef808d3a293727f9ce26753ec4.tar.gz
postgresql-da11977de9c685ef808d3a293727f9ce26753ec4.zip
1 files changed, 17 insertions, 4 deletions
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index ab224b76b86..320c7f1a616 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -232,9 +232,7 @@ compute_tsvector_stats(VacAttrStats *stats,
 
 		/*
 		 * We loop through the lexemes in the tsvector and add them to our
-		 * tracking hashtable.  Note: the hashtable entries will point into
-		 * the (detoasted) tsvector value, therefore we cannot free that
-		 * storage until we're done.
+		 * tracking hashtable.
 		 */
 		lexemesptr = STRPTR(vector);
 		curentryptr = ARRPTR(vector);
@@ -242,7 +240,12 @@ compute_tsvector_stats(VacAttrStats *stats,
 		{
 			bool		found;
 
-			/* Construct a hash key */
+			/*
+			 * Construct a hash key.  The key points into the (detoasted)
+			 * tsvector value at this point, but if a new entry is created, we
+			 * make a copy of it.  This way we can free the tsvector value
+			 * once we've processed all its lexemes.
+			 */
 			hash_key.lexeme = lexemesptr + curentryptr->pos;
 			hash_key.length = curentryptr->len;
 
@@ -261,6 +264,9 @@ compute_tsvector_stats(VacAttrStats *stats,
 				/* Initialize new tracking list element */
 				item->frequency = 1;
 				item->delta = b_current - 1;
+
+				item->key.lexeme = palloc(hash_key.length);
+				memcpy(item->key.lexeme, hash_key.lexeme, hash_key.length);
 			}
 
 			/* lexeme_no is the number of elements processed (ie N) */
@@ -276,6 +282,10 @@ compute_tsvector_stats(VacAttrStats *stats,
 			/* Advance to the next WordEntry in the tsvector */
 			curentryptr++;
 		}
+
+		/* If the vector was toasted, free the detoasted copy. */
+		if (TSVectorGetDatum(vector) != value)
+			pfree(vector);
 	}
 
 	/* We can only compute real stats if we found some non-null values. */
@@ -447,9 +457,12 @@ prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current)
 	{
 		if (item->frequency + item->delta <= b_current)
 		{
+			char	   *lexeme = item->key.lexeme;
+
 			if (hash_search(lexemes_tab, (const void *) &item->key,
 							HASH_REMOVE, NULL) == NULL)
 				elog(ERROR, "hash table corrupted");
+			pfree(lexeme);
 		}
 	}
 }
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>	2017-07-12 22:03:38 +0300
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>	2017-07-12 22:06:13 +0300
commit	da11977de9c685ef808d3a293727f9ce26753ec4 (patch)
tree	eea53c3ea5369db5ebb489e3133f7c234c102b62 /src
parent	ca793c59a51e94cedf8cbea5c29668bf8fa298f3 (diff)
download	postgresql-da11977de9c685ef808d3a293727f9ce26753ec4.tar.gz postgresql-da11977de9c685ef808d3a293727f9ce26753ec4.zip