aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/tsvector_parser.c
diff options
context:
space:
mode:
author: Tom Lane <tgl@sss.pgh.pa.us> 2007-10-21 22:29:56 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2007-10-21 22:29:56 +0000
commit: 1ea47dd8cbcb485a06676f12a36244270e18e192 (patch)
tree: afcd226d47815acf89a4a62442ecdd1467594edb /src/backend/utils/adt/tsvector_parser.c
parent: dfc6f130b4fe64cbaff379eb1a2a97f6e050eb3e (diff)
download: postgresql-1ea47dd8cbcb485a06676f12a36244270e18e192.tar.gz
postgresql-1ea47dd8cbcb485a06676f12a36244270e18e192.zip
Fix shared tsvector/tsquery input code so that we don't say "syntax error in
tsvector" when we are really parsing a tsquery. Report the bogus input, too. Make styles of some related error messages more consistent.
Diffstat (limited to 'src/backend/utils/adt/tsvector_parser.c')
-rw-r--r-- src/backend/utils/adt/tsvector_parser.c 115
1 files changed, 63 insertions, 52 deletions
diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c
index 26a271679d4..5ee8bb78422 100644
--- a/src/backend/utils/adt/tsvector_parser.c
+++ b/src/backend/utils/adt/tsvector_parser.c
@@ -7,7 +7,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_parser.c,v 1.1 2007/09/07 15:09:56 teodor Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_parser.c,v 1.2 2007/10/21 22:29:56 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -20,35 +20,49 @@
#include "tsearch/ts_utils.h"
#include "utils/memutils.h"
+
+/*
+ * Private state of tsvector parser. Note that tsquery also uses this code to
+ * parse its input, hence the boolean flags. The two flags are both true or
+ * both false in current usage, but we keep them separate for clarity.
+ * is_tsquery affects *only* the content of error messages.
+ */
struct TSVectorParseStateData
{
- char *prsbuf;
- char *word; /* buffer to hold the current word */
- int len; /* size in bytes allocated for 'word' */
- bool oprisdelim;
+ char *prsbuf; /* next input character */
+ char *bufstart; /* whole string (used only for errors) */
+ char *word; /* buffer to hold the current word */
+ int len; /* size in bytes allocated for 'word' */
+ int eml; /* max bytes per character */
+ bool oprisdelim; /* treat ! | & ( ) as delimiters? */
+ bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */
};
+
/*
* Initializes parser for the input string. If oprisdelim is set, the
* following characters are treated as delimiters in addition to whitespace:
* ! | & ( )
*/
TSVectorParseState
-init_tsvector_parser(char *input, bool oprisdelim)
+init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
{
TSVectorParseState state;
state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
state->prsbuf = input;
+ state->bufstart = input;
state->len = 32;
state->word = (char *) palloc(state->len);
+ state->eml = pg_database_encoding_max_length();
state->oprisdelim = oprisdelim;
+ state->is_tsquery = is_tsquery;
return state;
}
/*
- * Reinitializes parser for parsing 'input', instead of previous input.
+ * Reinitializes parser to parse 'input', instead of previous input.
*/
void
reset_tsvector_parser(TSVectorParseState state, char *input)
@@ -66,21 +80,21 @@ close_tsvector_parser(TSVectorParseState state)
pfree(state);
}
+/* increase the size of 'word' if needed to hold one more character */
#define RESIZEPRSBUF \
do { \
- if ( curpos - state->word + pg_database_encoding_max_length() >= state->len ) \
+ int clen = curpos - state->word; \
+ if ( clen + state->eml >= state->len ) \
{ \
- int clen = curpos - state->word; \
state->len *= 2; \
- state->word = (char*)repalloc( (void*)state->word, state->len ); \
+ state->word = (char *) repalloc(state->word, state->len); \
curpos = state->word + clen; \
} \
} while (0)
-
#define ISOPERATOR(x) ( pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) )
-/* Fills the output parameters, and returns true */
+/* Fills gettoken_tsvector's output parameters, and returns true */
#define RETURN_TOKEN \
do { \
if (pos_ptr != NULL) \
@@ -111,18 +125,34 @@ do { \
#define WAITPOSDELIM 7
#define WAITCHARCMPLX 8
+#define PRSSYNTAXERROR prssyntaxerror(state)
+
+static void
+prssyntaxerror(TSVectorParseState state)
+{
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ state->is_tsquery ?
+ errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
+ errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
+}
+
+
/*
- * Get next token from string being parsed. Returns false if
- * end of input string is reached, otherwise strval, lenval, pos_ptr
- * and poslen output parameters are filled in:
+ * Get next token from string being parsed. Returns true if successful,
+ * false if end of input string is reached. On success, these output
+ * parameters are filled in:
*
- * *strval token
- * *lenval length of*strval
+ * *strval pointer to token
+ * *lenval length of *strval
* *pos_ptr pointer to a palloc'd array of positions and weights
* associated with the token. If the caller is not interested
* in the information, NULL can be supplied. Otherwise
* the caller is responsible for pfreeing the array.
* *poslen number of elements in *pos_ptr
+ * *endptr scan resumption point
+ *
+ * Pass NULL for unwanted output parameters.
*/
bool
gettoken_tsvector(TSVectorParseState state,
@@ -155,9 +185,7 @@ gettoken_tsvector(TSVectorParseState state,
oldstate = WAITENDWORD;
}
else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
- ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("syntax error in tsvector")));
+ PRSSYNTAXERROR;
else if (!t_isspace(state->prsbuf))
{
COPYCHAR(curpos, state->prsbuf);
@@ -170,7 +198,8 @@ gettoken_tsvector(TSVectorParseState state,
if (*(state->prsbuf) == '\0')
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("there is no escaped character")));
+ errmsg("there is no escaped character: \"%s\"",
+ state->bufstart)));
else
{
RESIZEPRSBUF;
@@ -192,18 +221,14 @@ gettoken_tsvector(TSVectorParseState state,
{
RESIZEPRSBUF;
if (curpos == state->word)
- ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("syntax error in tsvector")));
+ PRSSYNTAXERROR;
*(curpos) = '\0';
RETURN_TOKEN;
}
else if (t_iseq(state->prsbuf, ':'))
{
if (curpos == state->word)
- ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("syntax error in tsvector")));
+ PRSSYNTAXERROR;
*(curpos) = '\0';
if (state->oprisdelim)
RETURN_TOKEN;
@@ -229,9 +254,7 @@ gettoken_tsvector(TSVectorParseState state,
oldstate = WAITENDCMPLX;
}
else if (*(state->prsbuf) == '\0')
- ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("syntax error in tsvector")));
+ PRSSYNTAXERROR;
else
{
RESIZEPRSBUF;
@@ -253,9 +276,7 @@ gettoken_tsvector(TSVectorParseState state,
RESIZEPRSBUF;
*(curpos) = '\0';
if (curpos == state->word)
- ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("syntax error in tsvector")));
+ PRSSYNTAXERROR;
if (state->oprisdelim)
{
/* state->prsbuf+=pg_mblen(state->prsbuf); */
@@ -290,17 +311,17 @@ gettoken_tsvector(TSVectorParseState state,
}
npos++;
WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
+ /* we cannot get here in tsquery, so no need for 2 errmsgs */
if (WEP_GETPOS(pos[npos - 1]) == 0)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("wrong position info in tsvector")));
+ errmsg("wrong position info in tsvector: \"%s\"",
+ state->bufstart)));
WEP_SETWEIGHT(pos[npos - 1], 0);
statecode = WAITPOSDELIM;
}
else
- ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("syntax error in tsvector")));
+ PRSSYNTAXERROR;
}
else if (statecode == WAITPOSDELIM)
{
@@ -309,42 +330,32 @@ gettoken_tsvector(TSVectorParseState state,
else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
{
if (WEP_GETWEIGHT(pos[npos - 1]))
- ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("syntax error in tsvector")));
+ PRSSYNTAXERROR;
WEP_SETWEIGHT(pos[npos - 1], 3);
}
else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
{
if (WEP_GETWEIGHT(pos[npos - 1]))
- ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("syntax error in tsvector")));
+ PRSSYNTAXERROR;
WEP_SETWEIGHT(pos[npos - 1], 2);
}
else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
{
if (WEP_GETWEIGHT(pos[npos - 1]))
- ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("syntax error in tsvector")));
+ PRSSYNTAXERROR;
WEP_SETWEIGHT(pos[npos - 1], 1);
}
else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
{
if (WEP_GETWEIGHT(pos[npos - 1]))
- ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("syntax error in tsvector")));
+ PRSSYNTAXERROR;
WEP_SETWEIGHT(pos[npos - 1], 0);
}
else if (t_isspace(state->prsbuf) ||
*(state->prsbuf) == '\0')
RETURN_TOKEN;
else if (!t_isdigit(state->prsbuf))
- ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("syntax error in tsvector")));
+ PRSSYNTAXERROR;
}
else /* internal error */
elog(ERROR, "internal error in gettoken_tsvector");