aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2011-01-31 21:33:55 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2011-01-31 21:34:49 -0500
commit6e2f3ae8842392c46ccc91a9ce4bba92296890cb (patch)
tree5a672b0b9a6f1b5f0d1b264285cfb5a74574e13e
parent6238473adb119a5a11061e40e159e8c5187fe492 (diff)
downloadpostgresql-6e2f3ae8842392c46ccc91a9ce4bba92296890cb.tar.gz
postgresql-6e2f3ae8842392c46ccc91a9ce4bba92296890cb.zip
Support LIKE and ILIKE index searches via contrib/pg_trgm indexes.
Unlike Btree-based LIKE optimization, this works for non-left-anchored search patterns. The effectiveness of the search depends on how many trigrams can be extracted from the pattern. (The worst case, with no trigrams, degrades to a full-table scan, so this isn't a panacea. But it can be very useful.) Alexander Korotkov, reviewed by Jan Urbanski
-rw-r--r--contrib/pg_trgm/expected/pg_trgm.out90
-rw-r--r--contrib/pg_trgm/pg_trgm.sql.in16
-rw-r--r--contrib/pg_trgm/sql/pg_trgm.sql25
-rw-r--r--contrib/pg_trgm/trgm.h22
-rw-r--r--contrib/pg_trgm/trgm_gin.c132
-rw-r--r--contrib/pg_trgm/trgm_gist.c95
-rw-r--r--contrib/pg_trgm/trgm_op.c286
-rw-r--r--contrib/pg_trgm/uninstall_pg_trgm.sql6
-rw-r--r--doc/src/sgml/pgtrgm.sgml18
9 files changed, 640 insertions, 50 deletions
diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out
index 0532a78e4ae..6e73af2a32f 100644
--- a/contrib/pg_trgm/expected/pg_trgm.out
+++ b/contrib/pg_trgm/expected/pg_trgm.out
@@ -3467,3 +3467,93 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
qwertyu0988 | 0.333333
(1 row)
+create table test2(t text);
+insert into test2 values ('abcdef');
+insert into test2 values ('quark');
+create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
+set enable_seqscan=off;
+explain (costs off)
+ select * from test2 where t like '%BCD%';
+ QUERY PLAN
+------------------------------------------
+ Bitmap Heap Scan on test2
+ Recheck Cond: (t ~~ '%BCD%'::text)
+ -> Bitmap Index Scan on test2_idx_gin
+ Index Cond: (t ~~ '%BCD%'::text)
+(4 rows)
+
+explain (costs off)
+ select * from test2 where t ilike '%BCD%';
+ QUERY PLAN
+-------------------------------------------
+ Bitmap Heap Scan on test2
+ Recheck Cond: (t ~~* '%BCD%'::text)
+ -> Bitmap Index Scan on test2_idx_gin
+ Index Cond: (t ~~* '%BCD%'::text)
+(4 rows)
+
+select * from test2 where t like '%BCD%';
+ t
+---
+(0 rows)
+
+select * from test2 where t like '%bcd%';
+ t
+--------
+ abcdef
+(1 row)
+
+select * from test2 where t ilike '%BCD%';
+ t
+--------
+ abcdef
+(1 row)
+
+select * from test2 where t ilike 'qua%';
+ t
+-------
+ quark
+(1 row)
+
+drop index test2_idx_gin;
+create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
+set enable_seqscan=off;
+explain (costs off)
+ select * from test2 where t like '%BCD%';
+ QUERY PLAN
+------------------------------------------
+ Index Scan using test2_idx_gist on test2
+ Index Cond: (t ~~ '%BCD%'::text)
+(2 rows)
+
+explain (costs off)
+ select * from test2 where t ilike '%BCD%';
+ QUERY PLAN
+------------------------------------------
+ Index Scan using test2_idx_gist on test2
+ Index Cond: (t ~~* '%BCD%'::text)
+(2 rows)
+
+select * from test2 where t like '%BCD%';
+ t
+---
+(0 rows)
+
+select * from test2 where t like '%bcd%';
+ t
+--------
+ abcdef
+(1 row)
+
+select * from test2 where t ilike '%BCD%';
+ t
+--------
+ abcdef
+(1 row)
+
+select * from test2 where t ilike 'qua%';
+ t
+-------
+ quark
+(1 row)
+
diff --git a/contrib/pg_trgm/pg_trgm.sql.in b/contrib/pg_trgm/pg_trgm.sql.in
index 3e116e8306f..12a8c21071b 100644
--- a/contrib/pg_trgm/pg_trgm.sql.in
+++ b/contrib/pg_trgm/pg_trgm.sql.in
@@ -113,6 +113,8 @@ FOR TYPE text USING gist
AS
OPERATOR 1 % (text, text),
OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
+ OPERATOR 3 pg_catalog.~~ (text, text),
+ OPERATOR 4 pg_catalog.~~* (text, text),
FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal),
FUNCTION 2 gtrgm_union (bytea, internal),
FUNCTION 3 gtrgm_compress (internal),
@@ -124,17 +126,17 @@ AS
STORAGE gtrgm;
-- support functions for gin
-CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal)
+CREATE OR REPLACE FUNCTION gin_extract_value_trgm(text, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
-CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal, int2, internal, internal)
+CREATE OR REPLACE FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
-CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal)
+CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
@@ -144,8 +146,10 @@ CREATE OPERATOR CLASS gin_trgm_ops
FOR TYPE text USING gin
AS
OPERATOR 1 % (text, text),
+ OPERATOR 3 pg_catalog.~~ (text, text),
+ OPERATOR 4 pg_catalog.~~* (text, text),
FUNCTION 1 btint4cmp (int4, int4),
- FUNCTION 2 gin_extract_trgm (text, internal),
- FUNCTION 3 gin_extract_trgm (text, internal, int2, internal, internal),
- FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal),
+ FUNCTION 2 gin_extract_value_trgm (text, internal),
+ FUNCTION 3 gin_extract_query_trgm (text, internal, int2, internal, internal, internal, internal),
+ FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal, internal, internal),
STORAGE int4;
diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql
index 5e5539c0057..b8209344c3f 100644
--- a/contrib/pg_trgm/sql/pg_trgm.sql
+++ b/contrib/pg_trgm/sql/pg_trgm.sql
@@ -45,3 +45,28 @@ set enable_seqscan=off;
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
+
+create table test2(t text);
+insert into test2 values ('abcdef');
+insert into test2 values ('quark');
+create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
+set enable_seqscan=off;
+explain (costs off)
+ select * from test2 where t like '%BCD%';
+explain (costs off)
+ select * from test2 where t ilike '%BCD%';
+select * from test2 where t like '%BCD%';
+select * from test2 where t like '%bcd%';
+select * from test2 where t ilike '%BCD%';
+select * from test2 where t ilike 'qua%';
+drop index test2_idx_gin;
+create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
+set enable_seqscan=off;
+explain (costs off)
+ select * from test2 where t like '%BCD%';
+explain (costs off)
+ select * from test2 where t ilike '%BCD%';
+select * from test2 where t like '%BCD%';
+select * from test2 where t like '%bcd%';
+select * from test2 where t ilike '%BCD%';
+select * from test2 where t ilike 'qua%';
diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h
index 1cc812554f7..f3644fcce76 100644
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -13,12 +13,21 @@
#define LPADDING 2
#define RPADDING 1
#define KEEPONLYALNUM
+/*
+ * Caution: IGNORECASE macro means that trigrams are case-insensitive.
+ * If this macro is disabled, the ~~* operator must be removed from the
+ * operator classes, because we can't handle case-insensitive wildcard search
+ * with case-sensitive trigrams. Failure to do this will result in "cannot
+ * handle ~~* with case-sensitive trigrams" errors.
+ */
#define IGNORECASE
#define DIVUNION
/* operator strategy numbers */
-#define SimilarityStrategyNumber 1
-#define DistanceStrategyNumber 2
+#define SimilarityStrategyNumber 1
+#define DistanceStrategyNumber 2
+#define LikeStrategyNumber 3
+#define ILikeStrategyNumber 4
typedef char trgm[3];
@@ -40,7 +49,10 @@ uint32 trgm2int(trgm *ptr);
#else
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) )
#endif
-#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)t) ) && ISPRINTABLECHAR( ((char*)t)+1 ) && ISPRINTABLECHAR( ((char*)t)+2 ) )
+#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
+
+#define ISESCAPECHAR(x) (*(x) == '\\') /* Wildcard escape character */
+#define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%') /* Wildcard meta-character */
typedef struct
{
@@ -65,7 +77,7 @@ typedef char *BITVECP;
for(i=0;i<SIGLEN;i++)
#define GETBYTE(x,i) ( *( (BITVECP)(x) + (int)( (i) / BITBYTE ) ) )
-#define GETBITBYTE(x,i) ( ((char)(x)) >> i & 0x01 )
+#define GETBITBYTE(x,i) ( (((char)(x)) >> (i)) & 0x01 )
#define CLRBIT(x,i) GETBYTE(x,i) &= ~( 0x01 << ( (i) % BITBYTE ) )
#define SETBIT(x,i) GETBYTE(x,i) |= ( 0x01 << ( (i) % BITBYTE ) )
#define GETBIT(x,i) ( (GETBYTE(x,i) >> ( (i) % BITBYTE )) & 0x01 )
@@ -89,6 +101,8 @@ typedef char *BITVECP;
extern float4 trgm_limit;
TRGM *generate_trgm(char *str, int slen);
+TRGM *generate_wildcard_trgm(const char *str, int slen);
float4 cnt_sml(TRGM *trg1, TRGM *trg2);
+bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
#endif /* __TRGM_H__ */
diff --git a/contrib/pg_trgm/trgm_gin.c b/contrib/pg_trgm/trgm_gin.c
index 2d3e35e2330..cdaa72ea611 100644
--- a/contrib/pg_trgm/trgm_gin.c
+++ b/contrib/pg_trgm/trgm_gin.c
@@ -7,6 +7,7 @@
#include "access/gin.h"
#include "access/itup.h"
+#include "access/skey.h"
#include "access/tuptoaster.h"
#include "storage/bufpage.h"
#include "utils/array.h"
@@ -16,15 +17,32 @@
PG_FUNCTION_INFO_V1(gin_extract_trgm);
Datum gin_extract_trgm(PG_FUNCTION_ARGS);
+PG_FUNCTION_INFO_V1(gin_extract_value_trgm);
+Datum gin_extract_value_trgm(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(gin_extract_query_trgm);
+Datum gin_extract_query_trgm(PG_FUNCTION_ARGS);
+
PG_FUNCTION_INFO_V1(gin_trgm_consistent);
Datum gin_trgm_consistent(PG_FUNCTION_ARGS);
/*
- * This function is used as both extractValue and extractQuery
+ * This function can only be called if a pre-9.1 version of the GIN operator
+ * class definition is present in the catalogs (probably as a consequence
+ * of upgrade-in-place). Complain.
*/
Datum
gin_extract_trgm(PG_FUNCTION_ARGS)
{
+ ereport(ERROR,
+ (errmsg("GIN operator class for pg_trgm is out of date"),
+ errhint("Please drop and re-create the pg_trgm catalog entries.")));
+ PG_RETURN_NULL();
+}
+
+Datum
+gin_extract_value_trgm(PG_FUNCTION_ARGS)
+{
text *val = (text *) PG_GETARG_TEXT_P(0);
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
Datum *entries = NULL;
@@ -58,33 +76,123 @@ gin_extract_trgm(PG_FUNCTION_ARGS)
}
Datum
+gin_extract_query_trgm(PG_FUNCTION_ARGS)
+{
+ text *val = (text *) PG_GETARG_TEXT_P(0);
+ int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
+ StrategyNumber strategy = PG_GETARG_UINT16(2);
+ /* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */
+ /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
+ /* bool **nullFlags = (bool **) PG_GETARG_POINTER(5); */
+ int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
+ Datum *entries = NULL;
+ TRGM *trg;
+ int32 trglen;
+ trgm *ptr;
+ int32 i;
+
+ switch (strategy)
+ {
+ case SimilarityStrategyNumber:
+ trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
+ break;
+ case ILikeStrategyNumber:
+#ifndef IGNORECASE
+ elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
+#endif
+ /* FALL THRU */
+ case LikeStrategyNumber:
+ /*
+ * For wildcard search we extract all the trigrams that every
+ * potentially-matching string must include.
+ */
+ trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ trg = NULL; /* keep compiler quiet */
+ break;
+ }
+
+ trglen = ARRNELEM(trg);
+ *nentries = trglen;
+
+ if (trglen > 0)
+ {
+ entries = (Datum *) palloc(sizeof(Datum) * trglen);
+ ptr = GETARR(trg);
+ for (i = 0; i < trglen; i++)
+ {
+ int32 item = trgm2int(ptr);
+
+ entries[i] = Int32GetDatum(item);
+ ptr++;
+ }
+ }
+
+ /*
+ * If no trigram was extracted then we have to scan all the index.
+ */
+ if (trglen == 0)
+ *searchMode = GIN_SEARCH_MODE_ALL;
+
+ PG_RETURN_POINTER(entries);
+}
+
+Datum
gin_trgm_consistent(PG_FUNCTION_ARGS)
{
bool *check = (bool *) PG_GETARG_POINTER(0);
- /* StrategyNumber strategy = PG_GETARG_UINT16(1); */
+ StrategyNumber strategy = PG_GETARG_UINT16(1);
/* text *query = PG_GETARG_TEXT_P(2); */
int32 nkeys = PG_GETARG_INT32(3);
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
bool *recheck = (bool *) PG_GETARG_POINTER(5);
- bool res = FALSE;
+ bool res;
int32 i,
- ntrue = 0;
+ ntrue;
/* All cases served by this function are inexact */
*recheck = true;
- /* Count the matches */
- for (i = 0; i < nkeys; i++)
+ switch (strategy)
{
- if (check[i])
- ntrue++;
- }
-
+ case SimilarityStrategyNumber:
+ /* Count the matches */
+ ntrue = 0;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (check[i])
+ ntrue++;
+ }
#ifdef DIVUNION
- res = (nkeys == ntrue) ? true : ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >= trgm_limit) ? true : false);
+ res = (nkeys == ntrue) ? true : ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >= trgm_limit) ? true : false);
#else
- res = (nkeys == 0) ? false : ((((((float4) ntrue) / ((float4) nkeys))) >= trgm_limit) ? true : false);
+ res = (nkeys == 0) ? false : ((((((float4) ntrue) / ((float4) nkeys))) >= trgm_limit) ? true : false);
#endif
+ break;
+ case ILikeStrategyNumber:
+#ifndef IGNORECASE
+ elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
+#endif
+ /* FALL THRU */
+ case LikeStrategyNumber:
+ /* Check if all extracted trigrams are presented. */
+ res = true;
+ for (i = 0; i < nkeys; i++)
+ {
+ if (!check[i])
+ {
+ res = false;
+ break;
+ }
+ }
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ res = false; /* keep compiler quiet */
+ break;
+ }
PG_RETURN_BOOL(res);
}
diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c
index d9f3d40c179..d83265c11c3 100644
--- a/contrib/pg_trgm/trgm_gist.c
+++ b/contrib/pg_trgm/trgm_gist.c
@@ -195,31 +195,61 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
bool res;
- char *cache = (char *) fcinfo->flinfo->fn_extra;
-
- /* All cases served by this function are exact */
- *recheck = false;
-
- if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0)
+ char *cache = (char *) fcinfo->flinfo->fn_extra,
+ *cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
+
+ /*
+ * Store both the strategy number and extracted trigrams in cache, because
+ * trigram extraction is relatively CPU-expensive. We must include
+ * strategy number because trigram extraction depends on strategy.
+ */
+ if (cache == NULL || strategy != *((StrategyNumber *) cache) ||
+ VARSIZE(cacheContents) != VARSIZE(query) ||
+ memcmp(cacheContents, query, VARSIZE(query)) != 0)
{
- qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
+ switch (strategy)
+ {
+ case SimilarityStrategyNumber:
+ qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
+ break;
+ case ILikeStrategyNumber:
+#ifndef IGNORECASE
+ elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
+#endif
+ /* FALL THRU */
+ case LikeStrategyNumber:
+ qtrg = generate_wildcard_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ qtrg = NULL; /* keep compiler quiet */
+ break;
+ }
if (cache)
pfree(cache);
- fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
- MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
+ fcinfo->flinfo->fn_extra =
+ MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
+ MAXALIGN(sizeof(StrategyNumber)) +
+ MAXALIGN(VARSIZE(query)) +
+ VARSIZE(qtrg));
cache = (char *) fcinfo->flinfo->fn_extra;
+ cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
- memcpy(cache, query, VARSIZE(query));
- memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));
+ *((StrategyNumber *) cache) = strategy;
+ memcpy(cacheContents, query, VARSIZE(query));
+ memcpy(cacheContents + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));
}
- qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
+ qtrg = (TRGM *) (cacheContents + MAXALIGN(VARSIZE(query)));
switch (strategy)
{
case SimilarityStrategyNumber:
+ /* Similarity search is exact */
+ *recheck = false;
+
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg);
@@ -242,6 +272,47 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false;
}
break;
+ case ILikeStrategyNumber:
+#ifndef IGNORECASE
+ elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
+#endif
+ /* FALL THRU */
+ case LikeStrategyNumber:
+ /* Wildcard search is inexact */
+ *recheck = true;
+
+ /*
+ * Check if all the extracted trigrams can be present in child
+ * nodes.
+ */
+ if (GIST_LEAF(entry))
+ { /* all leafs contains orig trgm */
+ res = trgm_contained_by(qtrg, key);
+ }
+ else if (ISALLTRUE(key))
+ { /* non-leaf contains signature */
+ res = true;
+ }
+ else
+ { /* non-leaf contains signature */
+ int32 k,
+ tmp = 0,
+ len = ARRNELEM(qtrg);
+ trgm *ptr = GETARR(qtrg);
+ BITVECP sign = GETSIGN(key);
+
+ res = true;
+ for (k = 0; k < len; k++)
+ {
+ CPTRGM(((char *) &tmp), ptr + k);
+ if (!GETBIT(sign, HASHVAL(tmp)))
+ {
+ res = false;
+ break;
+ }
+ }
+ }
+ break;
default:
elog(ERROR, "unrecognized strategy number: %d", strategy);
res = false; /* keep compiler quiet */
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index b97e9912bad..52f9172f6d5 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -18,6 +18,23 @@ float4 trgm_limit = 0.3f;
PG_FUNCTION_INFO_V1(set_limit);
Datum set_limit(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(show_limit);
+Datum show_limit(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(show_trgm);
+Datum show_trgm(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(similarity);
+Datum similarity(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(similarity_dist);
+Datum similarity_dist(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(similarity_op);
+Datum similarity_op(PG_FUNCTION_ARGS);
+
+
Datum
set_limit(PG_FUNCTION_ARGS)
{
@@ -29,8 +46,6 @@ set_limit(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4(trgm_limit);
}
-PG_FUNCTION_INFO_V1(show_limit);
-Datum show_limit(PG_FUNCTION_ARGS);
Datum
show_limit(PG_FUNCTION_ARGS)
{
@@ -120,7 +135,7 @@ cnt_trigram(trgm *tptr, char *str, int bytelen)
#endif
/*
- * Adds trigramm from words (already padded).
+ * Adds trigrams from words (already padded).
*/
static trgm *
make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
@@ -236,6 +251,225 @@ generate_trgm(char *str, int slen)
return trg;
}
+/*
+ * Extract the next non-wildcard part of a search string, ie, a word bounded
+ * by '_' or '%' meta-characters, non-word characters or string end.
+ *
+ * str: source string, of length lenstr bytes (need not be null-terminated)
+ * buf: where to return the substring (must be long enough)
+ * *bytelen: receives byte length of the found substring
+ * *charlen: receives character length of the found substring
+ *
+ * Returns pointer to end+1 of the found substring in the source string.
+ * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
+ *
+ * If the found word is bounded by non-word characters or string boundaries
+ * then this function will include corresponding padding spaces into buf.
+ */
+static const char *
+get_wildcard_part(const char *str, int lenstr,
+ char *buf, int *bytelen, int *charlen)
+{
+ const char *beginword = str;
+ const char *endword;
+ char *s = buf;
+ bool in_wildcard_meta = false;
+ bool in_escape = false;
+ int clen;
+
+ /*
+ * Find the first word character remembering whether last character was
+ * wildcard meta-character.
+ */
+ while (beginword - str < lenstr)
+ {
+ if (in_escape)
+ {
+ in_escape = false;
+ in_wildcard_meta = false;
+ if (iswordchr(beginword))
+ break;
+ }
+ else
+ {
+ if (ISESCAPECHAR(beginword))
+ in_escape = true;
+ else if (ISWILDCARDCHAR(beginword))
+ in_wildcard_meta = true;
+ else if (iswordchr(beginword))
+ break;
+ else
+ in_wildcard_meta = false;
+ }
+ beginword += pg_mblen(beginword);
+ }
+
+ /*
+ * Handle string end.
+ */
+ if (beginword - str >= lenstr)
+ return NULL;
+
+ /*
+ * Add left padding spaces if last character wasn't wildcard
+ * meta-character.
+ */
+ *charlen = 0;
+ if (!in_wildcard_meta)
+ {
+ if (LPADDING > 0)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ if (LPADDING > 1)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ }
+ }
+ }
+
+ /*
+ * Copy data into buf until wildcard meta-character, non-word character or
+ * string boundary. Strip escapes during copy.
+ */
+ endword = beginword;
+ in_wildcard_meta = false;
+ in_escape = false;
+ while (endword - str < lenstr)
+ {
+ clen = pg_mblen(endword);
+ if (in_escape)
+ {
+ in_escape = false;
+ in_wildcard_meta = false;
+ if (iswordchr(endword))
+ {
+ memcpy(s, endword, clen);
+ (*charlen)++;
+ s += clen;
+ }
+ else
+ break;
+ }
+ else
+ {
+ if (ISESCAPECHAR(endword))
+ in_escape = true;
+ else if (ISWILDCARDCHAR(endword))
+ {
+ in_wildcard_meta = true;
+ break;
+ }
+ else if (iswordchr(endword))
+ {
+ memcpy(s, endword, clen);
+ (*charlen)++;
+ s += clen;
+ }
+ else
+ {
+ in_wildcard_meta = false;
+ break;
+ }
+ }
+ endword += clen;
+ }
+
+ /*
+ * Add right padding spaces if last character wasn't wildcard
+ * meta-character.
+ */
+ if (!in_wildcard_meta)
+ {
+ if (RPADDING > 0)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ if (RPADDING > 1)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ }
+ }
+ }
+
+ *bytelen = s - buf;
+ return endword;
+}
+
+/*
+ * Generates trigrams for wildcard search string.
+ *
+ * Returns array of trigrams that must occur in any string that matches the
+ * wildcard string. For example, given pattern "a%bcd%" the trigrams
+ * " a", "bcd" would be extracted.
+ */
+TRGM *
+generate_wildcard_trgm(const char *str, int slen)
+{
+ TRGM *trg;
+ char *buf,
+ *buf2;
+ trgm *tptr;
+ int len,
+ charlen,
+ bytelen;
+ const char *eword;
+
+ trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
+ trg->flag = ARRKEY;
+ SET_VARSIZE(trg, TRGMHDRSIZE);
+
+ if (slen + LPADDING + RPADDING < 3 || slen == 0)
+ return trg;
+
+ tptr = GETARR(trg);
+
+ buf = palloc(sizeof(char) * (slen + 4));
+
+ /*
+ * Extract trigrams from each substring extracted by get_wildcard_part.
+ */
+ eword = str;
+ while ((eword = get_wildcard_part(eword, slen - (eword - str),
+ buf, &bytelen, &charlen)) != NULL)
+ {
+#ifdef IGNORECASE
+ buf2 = lowerstr_with_len(buf, bytelen);
+ bytelen = strlen(buf2);
+#else
+ buf2 = buf;
+#endif
+
+ /*
+ * count trigrams
+ */
+ tptr = make_trigrams(tptr, buf2, bytelen, charlen);
+#ifdef IGNORECASE
+ pfree(buf2);
+#endif
+ }
+
+ pfree(buf);
+
+ if ((len = tptr - GETARR(trg)) == 0)
+ return trg;
+
+ /*
+ * Make trigrams unique.
+ */
+ if (len > 0)
+ {
+ qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
+ len = unique_array(GETARR(trg), len);
+ }
+
+ SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
+
+ return trg;
+}
+
uint32
trgm2int(trgm *ptr)
{
@@ -250,8 +484,6 @@ trgm2int(trgm *ptr)
return val;
}
-PG_FUNCTION_INFO_V1(show_trgm);
-Datum show_trgm(PG_FUNCTION_ARGS);
Datum
show_trgm(PG_FUNCTION_ARGS)
{
@@ -340,8 +572,44 @@ cnt_sml(TRGM *trg1, TRGM *trg2)
}
-PG_FUNCTION_INFO_V1(similarity);
-Datum similarity(PG_FUNCTION_ARGS);
+/*
+ * Returns whether trg2 contains all trigrams in trg1.
+ * This relies on the trigram arrays being sorted.
+ */
+bool
+trgm_contained_by(TRGM *trg1, TRGM *trg2)
+{
+ trgm *ptr1,
+ *ptr2;
+ int len1,
+ len2;
+
+ ptr1 = GETARR(trg1);
+ ptr2 = GETARR(trg2);
+
+ len1 = ARRNELEM(trg1);
+ len2 = ARRNELEM(trg2);
+
+ while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
+ {
+ int res = CMPTRGM(ptr1, ptr2);
+
+ if (res < 0)
+ return false;
+ else if (res > 0)
+ ptr2++;
+ else
+ {
+ ptr1++;
+ ptr2++;
+ }
+ }
+ if (ptr1 - GETARR(trg1) < len1)
+ return false;
+ else
+ return true;
+}
+
Datum
similarity(PG_FUNCTION_ARGS)
{
@@ -364,8 +632,6 @@ similarity(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4(res);
}
-PG_FUNCTION_INFO_V1(similarity_dist);
-Datum similarity_dist(PG_FUNCTION_ARGS);
Datum
similarity_dist(PG_FUNCTION_ARGS)
{
@@ -375,8 +641,6 @@ similarity_dist(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4(1.0 - res);
}
-PG_FUNCTION_INFO_V1(similarity_op);
-Datum similarity_op(PG_FUNCTION_ARGS);
Datum
similarity_op(PG_FUNCTION_ARGS)
{
diff --git a/contrib/pg_trgm/uninstall_pg_trgm.sql b/contrib/pg_trgm/uninstall_pg_trgm.sql
index bc8f1fa983d..961e40ca487 100644
--- a/contrib/pg_trgm/uninstall_pg_trgm.sql
+++ b/contrib/pg_trgm/uninstall_pg_trgm.sql
@@ -25,11 +25,11 @@ DROP TYPE gtrgm CASCADE;
DROP OPERATOR CLASS gin_trgm_ops USING gin;
-DROP FUNCTION gin_extract_trgm(text, internal);
+DROP FUNCTION gin_extract_value_trgm(text, internal);
-DROP FUNCTION gin_extract_trgm(text, internal, int2, internal, internal);
+DROP FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal);
-DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal);
+DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal);
DROP OPERATOR % (text, text);
diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml
index 27a23067205..9210899ec24 100644
--- a/doc/src/sgml/pgtrgm.sgml
+++ b/doc/src/sgml/pgtrgm.sgml
@@ -137,8 +137,10 @@
The <filename>pg_trgm</filename> module provides GiST and GIN index
operator classes that allow you to create an index over a text column for
the purpose of very fast similarity searches. These index types support
- the above-described similarity operators (and no other operators, so you may
- want a regular B-tree index too).
+ the above-described similarity operators, and additionally support
+ trigram-based index searches for <literal>LIKE</> and <literal>ILIKE</>
+ queries. (These indexes do not support equality nor simple comparison
+ operators, so you may need a regular B-tree index too.)
</para>
<para>
@@ -182,6 +184,18 @@ SELECT t, t &lt;-&gt; '<replaceable>word</>' AS dist
</para>
<para>
+ Beginning in <productname>PostgreSQL</> 9.1, these index types also support
+ index searches for <literal>LIKE</> and <literal>ILIKE</>, for example
+<programlisting>
+SELECT * FROM test_trgm WHERE t LIKE '%foo%bar';
+</programlisting>
+ The index search works by extracting trigrams from the search string
+ and then looking these up in the index. The more trigrams in the search
+ string, the more effective the index search is. Unlike B-tree based
+ searches, the search string need not be left-anchored.
+ </para>
+
+ <para>
The choice between GiST and GIN indexing depends on the relative
performance characteristics of GiST and GIN, which are discussed elsewhere.
As a rule of thumb, a GIN index is faster to search than a GiST index, but