aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--contrib/tsearch2/ispell/Makefile4
-rw-r--r--contrib/tsearch2/ispell/regis.c151
-rw-r--r--contrib/tsearch2/ispell/regis.h34
-rw-r--r--contrib/tsearch2/ispell/spell.c187
-rw-r--r--contrib/tsearch2/ispell/spell.h37
-rw-r--r--contrib/tsearch2/tsearch.sql.in2
6 files changed, 339 insertions, 76 deletions
diff --git a/contrib/tsearch2/ispell/Makefile b/contrib/tsearch2/ispell/Makefile
index c620aaa1539..fbbe3f7912f 100644
--- a/contrib/tsearch2/ispell/Makefile
+++ b/contrib/tsearch2/ispell/Makefile
@@ -1,4 +1,4 @@
-# $PostgreSQL: pgsql/contrib/tsearch2/ispell/Makefile,v 1.5 2003/11/29 19:51:36 pgsql Exp $
+# $PostgreSQL: pgsql/contrib/tsearch2/ispell/Makefile,v 1.6 2004/06/23 11:06:11 teodor Exp $
subdir = contrib/tsearch2/ispell
top_builddir = ../../..
@@ -8,7 +8,7 @@ include $(top_builddir)/src/Makefile.global
PG_CPPFLAGS = -I$(srcdir)/.. $(CPPFLAGS)
override CFLAGS += $(CFLAGS_SL)
-SUBOBJS = spell.o
+SUBOBJS = spell.o regis.o
all: SUBSYS.o
diff --git a/contrib/tsearch2/ispell/regis.c b/contrib/tsearch2/ispell/regis.c
new file mode 100644
index 00000000000..052413788b0
--- /dev/null
+++ b/contrib/tsearch2/ispell/regis.c
@@ -0,0 +1,151 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "regis.h"
+#include "common.h"
+
+int
+RS_isRegis(const char *str) {
+ unsigned char *ptr=(unsigned char *)str;
+
+ while(ptr && *ptr)
+ if ( isalpha(*ptr) || *ptr=='[' || *ptr==']' || *ptr=='^')
+ ptr++;
+ else
+ return 0;
+ return 1;
+}
+
+#define RS_IN_ONEOF 1
+#define RS_IN_ONEOF_IN 2
+#define RS_IN_NONEOF 3
+#define RS_IN_WAIT 4
+
+static RegisNode*
+newRegisNode(RegisNode *prev, int len) {
+ RegisNode *ptr;
+ ptr = (RegisNode*)malloc(RNHDRSZ+len+1);
+ if (!ptr)
+ ts_error(ERROR, "No memory");
+ memset(ptr,0,RNHDRSZ+len+1);
+ if (prev)
+ prev->next=ptr;
+ return ptr;
+}
+
+int
+RS_compile(Regis *r, int issuffix, const char *str) {
+ int i,len = strlen(str);
+ int state = RS_IN_WAIT;
+ RegisNode *ptr=NULL;
+
+ memset(r,0,sizeof(Regis));
+ r->issuffix = (issuffix) ? 1 : 0;
+
+ for(i=0;i<len;i++) {
+ unsigned char c = *( ( (unsigned char*)str ) + i );
+ if ( state == RS_IN_WAIT ) {
+ if ( isalpha(c) ) {
+ if ( ptr )
+ ptr = newRegisNode(ptr,len);
+ else
+ ptr = r->node = newRegisNode(NULL,len);
+ ptr->data[ 0 ] = c;
+ ptr->type = RSF_ONEOF;
+ ptr->len=1;
+ } else if ( c=='[' ) {
+ if ( ptr )
+ ptr = newRegisNode(ptr,len);
+ else
+ ptr = r->node = newRegisNode(NULL,len);
+ ptr->type = RSF_ONEOF;
+ state=RS_IN_ONEOF;
+ } else
+ ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1);
+ } else if ( state == RS_IN_ONEOF ) {
+ if ( c=='^' ) {
+ ptr->type = RSF_NONEOF;
+ state=RS_IN_NONEOF;
+ } else if ( isalpha(c) ) {
+ ptr->data[ 0 ] = c;
+ ptr->len=1;
+ state=RS_IN_ONEOF_IN;
+ } else
+ ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1);
+ } else if ( state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF ) {
+ if ( isalpha(c) ) {
+ ptr->data[ ptr->len ] = c;
+ ptr->len++;
+ } else if ( c==']' ) {
+ state=RS_IN_WAIT;
+ } else
+ ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1);
+ } else
+ ts_error(ERROR,"Internal error in RS_compile: %d\n", state);
+ }
+
+ ptr = r->node;
+ while(ptr) {
+ r->nchar++;
+ ptr=ptr->next;
+ }
+
+ return 0;
+}
+
+void
+RS_free(Regis *r) {
+ RegisNode *ptr=r->node,*tmp;
+
+ while(ptr) {
+ tmp=ptr->next;
+ free(ptr);
+ ptr = tmp;
+ }
+
+ r->node = NULL;
+}
+
+int
+RS_execute(Regis *r, const char *str, int len) {
+ RegisNode *ptr=r->node;
+ unsigned char *c;
+
+ if (len<0)
+ len=strlen(str);
+
+ if (len<r->nchar)
+ return 0;
+
+ if ( r->issuffix )
+ c = ((unsigned char*)str) + len - r->nchar;
+ else
+ c = (unsigned char*)str;
+
+ while(ptr) {
+ switch(ptr->type) {
+ case RSF_ONEOF:
+ if ( ptr->len==0 ) {
+ if ( *c != *(ptr->data) )
+ return 0;
+ } else if ( strchr((char*)ptr->data, *c) == NULL )
+ return 0;
+ break;
+ case RSF_NONEOF:
+ if ( ptr->len==0 ) {
+ if ( *c == *(ptr->data) )
+ return 0;
+ } else if ( strchr((char*)ptr->data, *c) != NULL )
+ return 0;
+ break;
+ default:
+ ts_error(ERROR,"RS_execute: Unknown type node: %d\n", ptr->type);
+ }
+ ptr=ptr->next;
+ c++;
+ }
+
+ return 1;
+}
diff --git a/contrib/tsearch2/ispell/regis.h b/contrib/tsearch2/ispell/regis.h
new file mode 100644
index 00000000000..64a7e9d996c
--- /dev/null
+++ b/contrib/tsearch2/ispell/regis.h
@@ -0,0 +1,34 @@
+#ifndef __REGIS_H__
+#define __REGIS_H__
+
+#include "postgres.h"
+
+typedef struct RegisNode {
+ uint32
+ type:2,
+ len:16,
+ unused:14;
+ struct RegisNode *next;
+ unsigned char data[1];
+} RegisNode;
+
+#define RNHDRSZ (sizeof(uint32)+sizeof(void*))
+
+#define RSF_ONEOF 1
+#define RSF_NONEOF 2
+
+typedef struct Regis {
+ RegisNode *node;
+ uint32
+ issuffix:1,
+ nchar:16,
+ unused:15;
+} Regis;
+
+int RS_isRegis(const char *str);
+
+int RS_compile(Regis *r, int issuffix, const char *str);
+void RS_free(Regis *r);
+/*возвращает 1 если матчится */
+int RS_execute(Regis *r, const char *str, int len);
+#endif
diff --git a/contrib/tsearch2/ispell/spell.c b/contrib/tsearch2/ispell/spell.c
index fa624738959..06d712470f1 100644
--- a/contrib/tsearch2/ispell/spell.c
+++ b/contrib/tsearch2/ispell/spell.c
@@ -190,24 +190,24 @@ FindWord(IspellDict * Conf, const char *word, int affixflag, char compoundonly)
{
SPNode *node = Conf->Dictionary;
SPNodeData *StopLow, *StopHigh, *StopMiddle;
- int level=0, wrdlen=strlen(word);
+ uint8 *ptr =(uint8*)word;
- while( node && level<wrdlen) {
+ while( node && *ptr) {
StopLow = node->data;
StopHigh = node->data+node->length;
while (StopLow < StopHigh) {
- StopMiddle = StopLow + (StopHigh - StopLow) / 2;
- if ( StopMiddle->val == ((uint8*)(word))[level] ) {
- if ( wrdlen==level+1 && StopMiddle->isword ) {
+ StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+ if ( StopMiddle->val == *ptr ) {
+ if ( *(ptr+1)=='\0' && StopMiddle->isword ) {
if ( compoundonly && !StopMiddle->compoundallow )
return 0;
if ( (affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
return 1;
}
node=StopMiddle->node;
- level++;
+ ptr++;
break;
- } else if ( StopMiddle->val < ((uint8*)(word))[level] ) {
+ } else if ( StopMiddle->val < *ptr ) {
StopLow = StopMiddle + 1;
} else {
StopHigh = StopMiddle;
@@ -236,19 +236,32 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
}
MEMOUT(Conf->Affix);
}
- if (type == 's')
- sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
- else
- sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
- Conf->Affix[Conf->naffixes].compile = 1;
- Conf->Affix[Conf->naffixes].flagflags = flagflags;
- Conf->Affix[Conf->naffixes].flag = flag;
- Conf->Affix[Conf->naffixes].type = type;
-
- strcpy(Conf->Affix[Conf->naffixes].find, find);
- strcpy(Conf->Affix[Conf->naffixes].repl, repl);
- Conf->Affix[Conf->naffixes].replen = strlen(repl);
- Conf->naffixes++;
+
+ if ( strcmp(mask,".")==0 ) {
+ Conf->Affix[Conf->naffixes].issimple=1;
+ Conf->Affix[Conf->naffixes].isregis=0;
+ *( Conf->Affix[Conf->naffixes].mask )='\0';
+ } else if ( RS_isRegis(mask) ) {
+ Conf->Affix[Conf->naffixes].issimple=0;
+ Conf->Affix[Conf->naffixes].isregis=1;
+ strcpy(Conf->Affix[Conf->naffixes].mask, mask);
+ } else {
+ Conf->Affix[Conf->naffixes].issimple=0;
+ Conf->Affix[Conf->naffixes].isregis=0;
+ if (type == FF_SUFFIX)
+ sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
+ else
+ sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
+ }
+ Conf->Affix[Conf->naffixes].compile = 1;
+ Conf->Affix[Conf->naffixes].flagflags = flagflags;
+ Conf->Affix[Conf->naffixes].flag = flag;
+ Conf->Affix[Conf->naffixes].type = type;
+
+ strcpy(Conf->Affix[Conf->naffixes].find, find);
+ strcpy(Conf->Affix[Conf->naffixes].repl, repl);
+ Conf->Affix[Conf->naffixes].replen = strlen(repl);
+ Conf->naffixes++;
return (0);
}
@@ -366,7 +379,7 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
continue;
}
- NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? 's' : 'p');
+ NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
}
fclose(affix);
@@ -550,6 +563,46 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type) {
return rs;
}
+static void
+mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix) {
+ int i,cnt=0;
+ int start = (issuffix) ? startsuffix : 0;
+ int end = (issuffix) ? Conf->naffixes : startsuffix;
+ AffixNode *Affix = (AffixNode*)malloc( ANHRDSZ + sizeof(AffixNodeData));
+
+ MEMOUT(Affix);
+ memset(Affix, 0, ANHRDSZ + sizeof(AffixNodeData) );
+ Affix->length=1;
+ Affix->isvoid=1;
+
+ if (issuffix) {
+ Affix->data->node=Conf->Suffix;
+ Conf->Suffix = Affix;
+ } else {
+ Affix->data->node=Conf->Prefix;
+ Conf->Prefix = Affix;
+ }
+
+
+ for(i=start;i<end;i++)
+ if (Conf->Affix[i].replen==0)
+ cnt++;
+
+ if ( cnt==0 )
+ return;
+
+ Affix->data->aff = (AFFIX**)malloc( sizeof(AFFIX*) * cnt );
+ MEMOUT(Affix->data->aff);
+ Affix->data->naff = (uint32)cnt;
+
+ cnt=0;
+ for(i=start;i<end;i++)
+ if (Conf->Affix[i].replen==0) {
+ Affix->data->aff[cnt] = Conf->Affix + i;
+ cnt++;
+ }
+}
+
void
NISortAffixes(IspellDict * Conf)
{
@@ -584,6 +637,8 @@ NISortAffixes(IspellDict * Conf)
Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, 'p');
Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, 's');
+ mkVoidAffix(Conf, 1, firstsuffix);
+ mkVoidAffix(Conf, 0, firstsuffix);
}
static AffixNodeData*
@@ -591,17 +646,23 @@ FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
AffixNodeData *StopLow, *StopHigh, *StopMiddle;
uint8 symbol;
+ if ( node->isvoid ) { /* search void affixes */
+ if (node->data->naff)
+ return node->data;
+ node = node->data->node;
+ }
+
while( node && *level<wrdlen) {
StopLow = node->data;
StopHigh = node->data+node->length;
while (StopLow < StopHigh) {
- StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+ StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
symbol = GETWCHAR(word,wrdlen,*level,type);
if ( StopMiddle->val == symbol ) {
+ (*level)++;
if ( StopMiddle->naff )
return StopMiddle;
node=StopMiddle->node;
- (*level)++;
break;
} else if ( StopMiddle->val < symbol ) {
StopLow = StopMiddle + 1;
@@ -617,11 +678,6 @@ FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
static char *
CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword) {
- regmatch_t subs[2]; /* workaround for apache&linux */
- int err;
- pg_wchar *data;
- size_t data_len;
- int dat_len;
if ( flagflags & FF_COMPOUNDONLYAFX ) {
if ( (Affix->flagflags & FF_COMPOUNDONLYAFX) == 0 )
@@ -631,7 +687,7 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
return NULL;
}
- if ( Affix->type=='s' ) {
+ if ( Affix->type==FF_SUFFIX ) {
strcpy(newword, word);
strcpy(newword + len - Affix->replen, Affix->find);
} else {
@@ -639,34 +695,50 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
strcat(newword, word + Affix->replen);
}
- if (Affix->compile)
- {
- int wmasklen,masklen = strlen(Affix->mask);
- pg_wchar *mask;
- mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
- wmasklen = pg_mb2wchar_with_len( Affix->mask, mask, masklen);
-
- err = pg_regcomp(&(Affix->reg), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
- pfree(mask);
- if (err)
+ if ( Affix->issimple ) {
+ return newword;
+ } else if ( Affix->isregis ) {
+ if (Affix->compile) {
+ RS_compile(&(Affix->reg.regis), (Affix->type==FF_SUFFIX) ? 1 : 0, Affix->mask);
+ Affix->compile = 0;
+ }
+ if ( RS_execute(&(Affix->reg.regis), newword, -1) )
+ return newword;
+ } else {
+ regmatch_t subs[2]; /* workaround for apache&linux */
+ int err;
+ pg_wchar *data;
+ size_t data_len;
+ int dat_len;
+ if (Affix->compile)
{
- /* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */
- pg_regfree(&(Affix->reg));
- return (NULL);
+ int wmasklen,masklen = strlen(Affix->mask);
+ pg_wchar *mask;
+ mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
+ wmasklen = pg_mb2wchar_with_len( Affix->mask, mask, masklen);
+
+ err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
+ pfree(mask);
+ if (err)
+ {
+ /* regerror(err, &(Affix->reg.regex), regerrstr, ERRSTRSIZE); */
+ pg_regfree(&(Affix->reg.regex));
+ return (NULL);
+ }
+ Affix->compile = 0;
}
- Affix->compile = 0;
- }
- /* Convert data string to wide characters */
- dat_len = strlen(newword);
- data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
- data_len = pg_mb2wchar_with_len(newword, data, dat_len);
+ /* Convert data string to wide characters */
+ dat_len = strlen(newword);
+ data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
+ data_len = pg_mb2wchar_with_len(newword, data, dat_len);
- if (!(err = pg_regexec(&(Affix->reg), data,dat_len,NULL, 1, subs, 0))) {
- pfree(data);
- return newword;
+ if (!(err = pg_regexec(&(Affix->reg.regex), data,dat_len,NULL, 1, subs, 0))) {
+ pfree(data);
+ return newword;
+ }
+ pfree(data);
}
- pfree(data);
return NULL;
}
@@ -715,7 +787,6 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) {
}
}
pnode = prefix->node;
- plevel++;
}
/* Find all other NORMAL forms of the 'word' (check suffix and then prefix)*/
@@ -754,13 +825,11 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) {
}
}
pnode = prefix->node;
- plevel++;
}
}
}
snode=suffix->node;
- slevel++;
}
if (cur == forms) {
@@ -1013,8 +1082,12 @@ NIFree(IspellDict * Conf)
for (i = 0; i < Conf->naffixes; i++)
{
- if (Affix[i].compile == 0)
- pg_regfree(&(Affix[i].reg));
+ if (Affix[i].compile == 0) {
+ if ( Affix[i].isregis )
+ RS_free(&(Affix[i].reg.regis));
+ else
+ pg_regfree(&(Affix[i].reg.regex));
+ }
}
if (Conf->Spell) {
for (i = 0; i < Conf->nspell; i++)
diff --git a/contrib/tsearch2/ispell/spell.h b/contrib/tsearch2/ispell/spell.h
index 150e4166e13..1e22a0d1bcd 100644
--- a/contrib/tsearch2/ispell/spell.h
+++ b/contrib/tsearch2/ispell/spell.h
@@ -3,6 +3,7 @@
#include <sys/types.h>
#include "regex/regex.h"
+#include "regis.h"
#include "c.h"
@@ -40,20 +41,29 @@ typedef struct spell_struct
typedef struct aff_struct
{
- char flag;
- char flagflags;
- char type;
- char mask[33];
- char find[16];
- char repl[16];
- regex_t reg;
- size_t replen;
- char compile;
+ uint32
+ flag:8,
+ type:2,
+ compile:1,
+ flagflags:3,
+ issimple:1,
+ isregis:1,
+ unused:1,
+ replen:16;
+ char mask[32];
+ char find[16];
+ char repl[16];
+ union {
+ regex_t regex;
+ Regis regis;
+ } reg;
} AFFIX;
#define FF_CROSSPRODUCT 0x01
#define FF_COMPOUNDWORD 0x02
#define FF_COMPOUNDONLYAFX 0x04
+#define FF_SUFFIX 2
+#define FF_PREFIX 1
struct AffixNode;
@@ -66,18 +76,13 @@ typedef struct {
} AffixNodeData;
typedef struct AffixNode {
- uint32 length;
+ uint32 isvoid:1,
+ length:31;
AffixNodeData data[1];
} AffixNode;
#define ANHRDSZ (sizeof(uint32))
-typedef struct Tree_struct
-{
- int Left[256],
- Right[256];
-} Tree_struct;
-
typedef struct {
char *affix;
int len;
diff --git a/contrib/tsearch2/tsearch.sql.in b/contrib/tsearch2/tsearch.sql.in
index 25cb05f59c0..f0b2397d0de 100644
--- a/contrib/tsearch2/tsearch.sql.in
+++ b/contrib/tsearch2/tsearch.sql.in
@@ -816,7 +816,7 @@ CREATE OPERATOR CLASS tsvector_ops
FUNCTION 1 tsvector_cmp(tsvector, tsvector);
--example of ISpell dictionary
---update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_id=4;
+--update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_name='ispell_template';
--example of synonym dict
--update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_id=5;
END;