about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Bruce Momjian <bruce@momjian.us> 2002-08-15 03:02:08 +0000
committer Bruce Momjian <bruce@momjian.us> 2002-08-15 03:02:08 +0000
commit 2860041bf0c132602c1bc805386984faa2a26e55 (patch)
tree 90b6539be2d92293c27ca2df489d89edb4484cbd
parent 1276356268bb99507d7d54f80540b8e0835c026b (diff)
download postgresql-2860041bf0c132602c1bc805386984faa2a26e55.tar.gz
postgresql-2860041bf0c132602c1bc805386984faa2a26e55.zip
August 13, 2002
Use parser of OpenFTS v0.33. -- Teodor Sigaev
-rw-r--r-- contrib/tsearch/README.tsearch 5
-rw-r--r-- contrib/tsearch/deflex.h 31
-rw-r--r-- contrib/tsearch/expected/tsearch.out 8
-rw-r--r-- contrib/tsearch/morph.c 24
-rw-r--r-- contrib/tsearch/parser.l 161
5 files changed, 129 insertions, 100 deletions
diff --git a/contrib/tsearch/README.tsearch b/contrib/tsearch/README.tsearch
index c63ae91edd0..a57df55eea7 100644
--- a/contrib/tsearch/README.tsearch
+++ b/contrib/tsearch/README.tsearch
@@ -4,6 +4,11 @@ a searchable data type (textual) with indexed access.
All work was done by Teodor Sigaev (teodor@stack.net) and Oleg Bartunov
(oleg@sai.msu.su).
+CHANGES:
+
+August 13, 2002
+ Use parser of OpenFTS v0.33.
+
IMPORTANT NOTICE:
This is a first step of our work on integration of OpenFTS
diff --git a/contrib/tsearch/deflex.h b/contrib/tsearch/deflex.h
index f9d68471679..17c4fdf1ec3 100644
--- a/contrib/tsearch/deflex.h
+++ b/contrib/tsearch/deflex.h
@@ -2,28 +2,33 @@
#define __DEFLEX_H__
/* rememder !!!! */
-#define LASTNUM 19
+#define LASTNUM 23
#define LATWORD 1
-#define NONLATINWORD 2
+#define CYRWORD 2
#define UWORD 3
#define EMAIL 4
#define FURL 5
#define HOST 6
-#define FLOAT 7
-#define FINT 8
-#define PARTWORD 9
-#define NONLATINPARTWORD 10
-#define LATPARTWORD 11
-#define SPACE 12
-#define SYMTAG 13
-#define HTTP 14
-#define DEFISWORD 15
-#define DEFISLATWORD 16
-#define DEFISNONLATINWORD 17
+#define SCIENTIFIC 7
+#define VERSIONNUMBER 8
+#define PARTHYPHENWORD 9
+#define CYRPARTHYPHENWORD 10
+#define LATPARTHYPHENWORD 11
+#define SPACE 12
+#define TAG 13
+#define HTTP 14
+#define HYPHENWORD 15
+#define LATHYPHENWORD 16
+#define CYRHYPHENWORD 17
#define URI 18
#define FILEPATH 19
+#define DECIMAL 20
+#define SIGNEDINT 21
+#define UNSIGNEDINT 22
+#define HTMLENTITY 23
extern const char *descr[];
#endif
+
diff --git a/contrib/tsearch/expected/tsearch.out b/contrib/tsearch/expected/tsearch.out
index f75b429bcbb..0b12765d8f6 100644
--- a/contrib/tsearch/expected/tsearch.out
+++ b/contrib/tsearch/expected/tsearch.out
@@ -689,9 +689,9 @@ SELECT count(*) FROM test_txtidx WHERE a ## '(eq|yt)&(wR|qh)';
select txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
<i <b> wow < jqw <> qwerty');
- txt2txtidx
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
- 'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' 'qwe@efd.r' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' 'teodor@stack.net' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32'
+ txt2txtidx
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' 'qwe@efd.r' 'readline-4' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' 'teodor@stack.net' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32'
(1 row)
select txtidxsize(txt2txtidx('345 qw'));
@@ -705,7 +705,7 @@ select txtidxsize(txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.e
<i <b> wow < jqw <> qwerty'));
txtidxsize
------------
- 52
+ 53
(1 row)
insert into test_txtidx (a) values ('345 qwerty');
diff --git a/contrib/tsearch/morph.c b/contrib/tsearch/morph.c
index 60797b07e92..b29a3f6779d 100644
--- a/contrib/tsearch/morph.c
+++ b/contrib/tsearch/morph.c
@@ -75,19 +75,23 @@ static MAPDICT mapdict[] = {
{NODICT, NODICT}, /* EMAIL */
{NODICT, NODICT}, /* FURL */
{NODICT, NODICT}, /* HOST */
- {NODICT, NODICT}, /* FLOAT */
- {NODICT, NODICT}, /* FINT */
- {BYLOCALE, DEFAULTDICT}, /* PARTWORD */
- {BYLOCALE, NODICT}, /* NONLATINPARTWORD */
- {DEFAULTDICT, NODICT}, /* LATPARTWORD */
+ {NODICT, NODICT}, /* SCIENTIFIC */
+ {NODICT, NODICT}, /* VERSIONNUMBER */
+ {BYLOCALE, DEFAULTDICT}, /* PARTHYPHENWORD */
+ {BYLOCALE, NODICT}, /* CYRPARTHYPHENWORD */
+ {DEFAULTDICT, NODICT}, /* LATPARTHYPHENWORD */
{STOPLEXEM, NODICT}, /* SPACE */
- {STOPLEXEM, NODICT}, /* SYMTAG */
+ {STOPLEXEM, NODICT}, /* TAG */
{STOPLEXEM, NODICT}, /* HTTP */
- {BYLOCALE, DEFAULTDICT}, /* DEFISWORD */
- {DEFAULTDICT, NODICT}, /* DEFISLATWORD */
- {BYLOCALE, NODICT}, /* DEFISNONLATINWORD */
+ {BYLOCALE, DEFAULTDICT}, /* HYPHENWORD */
+ {DEFAULTDICT, NODICT}, /* LATHYPHENWORD */
+ {BYLOCALE, NODICT}, /* CYRHYPHENWORD */
{NODICT, NODICT}, /* URI */
- {NODICT, NODICT} /* FILEPATH */
+ {NODICT, NODICT}, /* FILEPATH */
+ {NODICT, NODICT}, /* DECIMAL */
+ {NODICT, NODICT}, /* SIGNEDINT */
+ {NODICT, NODICT}, /* UNSIGNEDINT */
+ {STOPLEXEM, NODICT} /* HTMLENTITY */
};
static bool inited = false;
diff --git a/contrib/tsearch/parser.l b/contrib/tsearch/parser.l
index 6081fd4c7be..f30fbcd4f46 100644
--- a/contrib/tsearch/parser.l
+++ b/contrib/tsearch/parser.l
@@ -5,18 +5,17 @@
/* postgres allocation function */
#include "postgres.h"
-#define free pfree
-#define malloc palloc
+#define free pfree
+#define malloc palloc
#define realloc repalloc
#ifdef strdup
#undef strdup
#endif
-#define strdup pstrdup
-
+#define strdup pstrdup
char *token = NULL; /* pointer to token */
-char *s = NULL; /* for returning full defis-word */
+char *s = NULL; /* to return WHOLE hyphenated-word */
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
@@ -57,21 +56,21 @@ int bytestoread = 0; /* for limiting read from filehandle */
%option nounput
%option noyywrap
-
-/* parser's state for parsing defis-word */
+/* parser's state for parsing hyphenated-word */
%x DELIM
/* parser's state for parsing URL*/
%x URL
%x SERVER
-/* parser's state for parsing filepath */
-
+/* parser's state for parsing TAGS */
%x INTAG
%x QINTAG
+%x INCOMMENT
+%x INSCRIPT
-/* NONLATIN char */
-NONLATINALNUM [0-9\200-\377]
-NONLATINALPHA [\200-\377]
+/* cyrillic koi8 char */
+CYRALNUM [0-9\200-\377]
+CYRALPHA [\200-\377]
ALPHA [a-zA-Z\200-\377]
ALNUM [0-9a-zA-Z\200-\377]
@@ -81,66 +80,59 @@ URI [-_[:alnum:]/%,\.;=&?#]+
%%
-"<"[[:alpha:]] { BEGIN INTAG;
- token = tsearch_yytext;
- tokenlen = tsearch_yyleng;
- return SYMTAG;
- }
-
-"</"[[:alpha:]] { BEGIN INTAG;
- token = tsearch_yytext;
- tokenlen = tsearch_yyleng;
- return SYMTAG;
- }
+"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; }
-"<>" {
+<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
+ BEGIN INITIAL;
+ *tsearch_yytext=' '; *(tsearch_yytext+1) = '\0';
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return SYMTAG;
+ return SPACE;
}
-"<"[^>[:alpha:]] {
+"<!--" { BEGIN INCOMMENT; }
+
+<INCOMMENT>"-->" {
+ BEGIN INITIAL;
+ *tsearch_yytext=' '; *(tsearch_yytext+1) = '\0';
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SPACE;
}
-<INTAG>"\"" { BEGIN QINTAG;
- token = tsearch_yytext;
- tokenlen = tsearch_yyleng;
- return SYMTAG;
- }
-<QINTAG>"\\\"" {
- token = tsearch_yytext;
- tokenlen = tsearch_yyleng;
- return SYMTAG;
-}
+"<"[\![:alpha:]] { BEGIN INTAG; }
-<QINTAG>"\"" { BEGIN INTAG;
- token = tsearch_yytext;
- tokenlen = tsearch_yyleng;
- return SYMTAG;
- }
+"</"[[:alpha:]] { BEGIN INTAG; }
-<QINTAG>.|\n {
+<INTAG>"\"" { BEGIN QINTAG; }
+
+<QINTAG>"\\\"" ;
+
+<QINTAG>"\"" { BEGIN INTAG; }
+
+<INTAG>">" {
+ BEGIN INITIAL;
token = tsearch_yytext;
- tokenlen = tsearch_yyleng;
- return SYMTAG;
+ *tsearch_yytext=' ';
+ token = tsearch_yytext;
+ tokenlen = 1;
+ return TAG;
}
-<INTAG>">" { BEGIN INITIAL;
+<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n ;
+
+\&(quot|amp|nbsp|lt|gt)\; {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return SYMTAG;
- }
+ return HTMLENTITY;
+}
-<INTAG>.|\n {
+\&\#[0-9][0-9]?[0-9]?\; {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return SYMTAG;
+ return HTMLENTITY;
}
-
[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
token = tsearch_yytext;
@@ -148,22 +140,34 @@ URI [-_[:alnum:]/%,\.;=&?#]+
return EMAIL;
}
-<DELIM,INITIAL>[0-9] /* digit's and point (might be a version) */ {
+[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return FINT;
+ return SCIENTIFIC;
+}
+
+[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
+ token = tsearch_yytext;
+ tokenlen = tsearch_yyleng;
+ return VERSIONNUMBER;
+}
+
+[+-]?[0-9]+\.[0-9]+ {
+ token = tsearch_yytext;
+ tokenlen = tsearch_yyleng;
+ return DECIMAL;
}
-<DELIM,INITIAL>[0-9]+[0-9\.]*[0-9] /* digit's and point (might be a version) */ {
+[+-][0-9]+ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return FINT;
+ return SIGNEDINT;
}
-[+-]?[0-9\.]+[eE][+-]?[0-9]+ /* float */ {
+<DELIM,INITIAL>[0-9]+ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return FLOAT;
+ return UNSIGNEDINT;
}
http"://" {
@@ -208,52 +212,58 @@ ftp"://" {
return FILEPATH;
}
-({NONLATINALNUM}+-)+{NONLATINALPHA}+ /* composite-word */ {
+({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
s = strdup( tsearch_yytext );
tokenlen = tsearch_yyleng;
yyless( 0 );
token = s;
- return DEFISNONLATINWORD;
+ return CYRHYPHENWORD;
}
-([[:alnum:]]+-)+[[:alpha:]]+ /* composite-word */ {
+([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
- tokenlen = tsearch_yyleng;
s = strdup( tsearch_yytext );
+ tokenlen = tsearch_yyleng;
yyless( 0 );
token = s;
- return DEFISLATWORD;
+ return LATHYPHENWORD;
}
-({ALNUM}+-)+{ALPHA}+ /* composite-word */ {
+({ALNUM}+-)+{ALNUM}+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
s = strdup( tsearch_yytext );
tokenlen = tsearch_yyleng;
yyless( 0 );
token = s;
- return DEFISWORD;
+ return HYPHENWORD;
+}
+
+<DELIM>\+?[0-9]+\.[0-9]+ {
+ token = tsearch_yytext;
+ tokenlen = tsearch_yyleng;
+ return DECIMAL;
}
-<DELIM>{NONLATINALNUM}+ /* one word in composite-word */ {
+<DELIM>{CYRALPHA}+ /* one word in composite-word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return NONLATINPARTWORD;
+ return CYRPARTHYPHENWORD;
}
-<DELIM>[[:alnum:]]+ /* one word in composite-word */ {
+<DELIM>[[:alpha:]]+ /* one word in composite-word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return LATPARTWORD;
+ return LATPARTHYPHENWORD;
}
<DELIM>{ALNUM}+ /* one word in composite-word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return PARTWORD;
+ return PARTHYPHENWORD;
}
<DELIM>- {
@@ -264,17 +274,16 @@ ftp"://" {
<DELIM,SERVER,URL>.|\n /* return in basic state */ {
BEGIN INITIAL;
- tokenlen = tsearch_yyleng;
yyless( 0 );
}
-{NONLATINALNUM}+ /* normal word */ {
+{CYRALPHA}+ /* normal word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
- return NONLATINWORD;
+ return CYRWORD;
}
-[[:alnum:]]+ /* normal word */ {
+[[:alpha:]]+ /* normal word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return LATWORD;
@@ -286,7 +295,13 @@ ftp"://" {
return UWORD;
}
-.|\n {
+[ \r\n\t]+ {
+ token = tsearch_yytext;
+ tokenlen = tsearch_yyleng;
+ return SPACE;
+}
+
+. {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SPACE;