diff options
Diffstat (limited to 'contrib/tsearch2/dict_thesaurus.c')
-rw-r--r-- | contrib/tsearch2/dict_thesaurus.c | 921 |
1 files changed, 0 insertions, 921 deletions
diff --git a/contrib/tsearch2/dict_thesaurus.c b/contrib/tsearch2/dict_thesaurus.c deleted file mode 100644 index cb5d9cbb784..00000000000 --- a/contrib/tsearch2/dict_thesaurus.c +++ /dev/null @@ -1,921 +0,0 @@ -/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.9 2007/07/15 22:57:48 tgl Exp $ */ - -/* - * thesaurus - * Teodor Sigaev <teodor@sigaev.ru> - */ -#include "postgres.h" -#include "executor/spi.h" - -#include <ctype.h> - -#include "dict.h" -#include "common.h" -#include "ts_locale.h" - -/* - * Temporay we use TSLexeme.flags for inner use... - */ -#define DT_USEASIS 0x1000 - -typedef struct LexemeInfo -{ - uint16 idsubst; /* entry's number in DictThesaurus->subst */ - uint16 posinsubst; /* pos info in entry */ - uint16 tnvariant; /* total num lexemes in one variant */ - struct LexemeInfo *nextentry; - struct LexemeInfo *nextvariant; -} LexemeInfo; - -typedef struct -{ - char *lexeme; - LexemeInfo *entries; -} TheLexeme; - -typedef struct -{ - uint16 lastlexeme; /* number lexemes to substitute */ - uint16 reslen; - TSLexeme *res; /* prepared substituted result */ -} TheSubstitute; - -typedef struct -{ - /* subdictionary to normalize lexemes */ - DictInfo subdict; - - /* Array to search lexeme by exact match */ - TheLexeme *wrds; - int nwrds; - int ntwrds; - - /* - * Storage of substituted result, n-th element is for n-th expression - */ - TheSubstitute *subst; - int nsubst; -} DictThesaurus; - -PG_FUNCTION_INFO_V1(thesaurus_init); -Datum thesaurus_init(PG_FUNCTION_ARGS); - -PG_FUNCTION_INFO_V1(thesaurus_lexize); -Datum thesaurus_lexize(PG_FUNCTION_ARGS); - -static void -freeDictThesaurus(DictThesaurus * d) -{ - free(d); -} - -static void -newLexeme(DictThesaurus * d, char *b, char *e, uint16 idsubst, uint16 posinsubst) -{ - TheLexeme *ptr; - - if (d->nwrds >= d->ntwrds) - { - if (d->ntwrds == 0) - { - d->ntwrds = 16; - d->wrds = (TheLexeme *) malloc(sizeof(TheLexeme) * d->ntwrds); - } - else - { - d->ntwrds *= 2; - d->wrds = (TheLexeme *) realloc(d->wrds, sizeof(TheLexeme) * d->ntwrds); - } - if (!d->wrds) - elog(ERROR, "Out of memory"); - } - - ptr = d->wrds + d->nwrds; - d->nwrds++; - - if ((ptr->lexeme = malloc(e - b + 1)) == NULL) - elog(ERROR, "Out of memory"); - - memcpy(ptr->lexeme, b, e - b); - ptr->lexeme[e - b] = '\0'; - - if ((ptr->entries = (LexemeInfo *) malloc(sizeof(LexemeInfo))) == NULL) - elog(ERROR, "Out of memory"); - - ptr->entries->nextentry = NULL; - ptr->entries->idsubst = idsubst; - ptr->entries->posinsubst = posinsubst; -} - -static void -addWrd(DictThesaurus * d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis) -{ - static int nres = 0; - static int ntres = 0; - TheSubstitute *ptr; - - if (nwrd == 0) - { - nres = ntres = 0; - - if (idsubst >= d->nsubst) - { - if (d->nsubst == 0) - { - d->nsubst = 16; - d->subst = (TheSubstitute *) malloc(sizeof(TheSubstitute) * d->nsubst); - } - else - { - d->nsubst *= 2; - d->subst = (TheSubstitute *) realloc(d->subst, sizeof(TheSubstitute) * d->nsubst); - } - if (!d->subst) - elog(ERROR, "Out of memory"); - } - } - - ptr = d->subst + idsubst; - - ptr->lastlexeme = posinsubst - 1; - - if (nres + 1 >= ntres) - { - if (ntres == 0) - { - ntres = 2; - ptr->res = (TSLexeme *) malloc(sizeof(TSLexeme) * ntres); - } - else - { - ntres *= 2; - ptr->res = (TSLexeme *) realloc(ptr->res, sizeof(TSLexeme) * ntres); - } - - if (!ptr->res) - elog(ERROR, "Out of memory"); - } - - if ((ptr->res[nres].lexeme = malloc(e - b + 1)) == 0) - elog(ERROR, "Out of memory"); - memcpy(ptr->res[nres].lexeme, b, e - b); - ptr->res[nres].lexeme[e - b] = '\0'; - - ptr->res[nres].nvariant = nwrd; - if (useasis) - ptr->res[nres].flags = DT_USEASIS; - else - ptr->res[nres].flags = 0; - - ptr->res[++nres].lexeme = NULL; -} - -#define TR_WAITLEX 1 -#define TR_INLEX 2 -#define TR_WAITSUBS 3 -#define TR_INSUBS 4 - -static void -thesaurusRead(char *filename, DictThesaurus * d) -{ - FILE *fh; - char str[BUFSIZ]; - int lineno = 0; - uint16 idsubst = 0; - bool useasis = false; - - fh = fopen(to_absfilename(filename), "r"); - if (!fh) - elog(ERROR, "Thesaurus: cannot open '%s' file", filename); - - while (fgets(str, sizeof(str), fh)) - { - char *ptr = str; - int state = TR_WAITLEX; - char *beginwrd = NULL; - uint16 posinsubst = 0; - uint16 nwrd = 0; - - lineno++; - - /* is it comment ? */ - while (t_isspace(ptr)) - ptr += pg_mblen(ptr); - if (t_iseq(str, '#') || *str == '\0' || t_iseq(str, '\n') || t_iseq(str, '\r')) - continue; - - pg_verifymbstr(ptr, strlen(ptr), false); - while (*ptr) - { - if (state == TR_WAITLEX) - { - if (t_iseq(ptr, ':')) - { - if (posinsubst == 0) - { - fclose(fh); - elog(ERROR, "Thesaurus: Unexpected delimiter at %d line", lineno); - } - state = TR_WAITSUBS; - } - else if (!t_isspace(ptr)) - { - beginwrd = ptr; - state = TR_INLEX; - } - } - else if (state == TR_INLEX) - { - if (t_iseq(ptr, ':')) - { - newLexeme(d, beginwrd, ptr, idsubst, posinsubst++); - state = TR_WAITSUBS; - } - else if (t_isspace(ptr)) - { - newLexeme(d, beginwrd, ptr, idsubst, posinsubst++); - state = TR_WAITLEX; - } - } - else if (state == TR_WAITSUBS) - { - if (t_iseq(ptr, '*')) - { - useasis = true; - state = TR_INSUBS; - beginwrd = ptr + pg_mblen(ptr); - } - else if (t_iseq(ptr, '\\')) - { - useasis = false; - state = TR_INSUBS; - beginwrd = ptr + pg_mblen(ptr); - } - else if (!t_isspace(ptr)) - { - useasis = false; - beginwrd = ptr; - state = TR_INSUBS; - } - } - else if (state == TR_INSUBS) - { - if (t_isspace(ptr)) - { - if (ptr == beginwrd) - elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno); - addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis); - state = TR_WAITSUBS; - } - } - else - elog(ERROR, "Thesaurus: Unknown state: %d", state); - - ptr += pg_mblen(ptr); - } - - if (state == TR_INSUBS) - { - if (ptr == beginwrd) - elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno); - addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis); - } - - idsubst++; - - if (!(nwrd && posinsubst)) - { - fclose(fh); - elog(ERROR, "Thesaurus: Unexpected end of line at %d line", lineno); - } - - } - - d->nsubst = idsubst; - - fclose(fh); -} - -static TheLexeme * -addCompiledLexeme(TheLexeme * newwrds, int *nnw, int *tnm, TSLexeme * lexeme, LexemeInfo * src, uint16 tnvariant) -{ - - if (*nnw >= *tnm) - { - *tnm *= 2; - newwrds = (TheLexeme *) realloc(newwrds, sizeof(TheLexeme) * *tnm); - if (!newwrds) - elog(ERROR, "Out of memory"); - } - - newwrds[*nnw].entries = (LexemeInfo *) malloc(sizeof(LexemeInfo)); - if (!newwrds[*nnw].entries) - elog(ERROR, "Out of memory"); - - if (lexeme && lexeme->lexeme) - { - newwrds[*nnw].lexeme = strdup(lexeme->lexeme); - if (!newwrds[*nnw].lexeme) - elog(ERROR, "Out of memory"); - - newwrds[*nnw].entries->tnvariant = tnvariant; - } - else - { - newwrds[*nnw].lexeme = NULL; - newwrds[*nnw].entries->tnvariant = 1; - } - - newwrds[*nnw].entries->idsubst = src->idsubst; - newwrds[*nnw].entries->posinsubst = src->posinsubst; - - newwrds[*nnw].entries->nextentry = NULL; - - (*nnw)++; - return newwrds; -} - -static int -cmpLexemeInfo(LexemeInfo * a, LexemeInfo * b) -{ - if (a == NULL || b == NULL) - return 0; - - if (a->idsubst == b->idsubst) - { - if (a->posinsubst == b->posinsubst) - { - if (a->tnvariant == b->tnvariant) - return 0; - - return (a->tnvariant > b->tnvariant) ? 1 : -1; - } - - return (a->posinsubst > b->posinsubst) ? 1 : -1; - } - - return (a->idsubst > b->idsubst) ? 1 : -1; -} - -static int -cmpLexeme(TheLexeme * a, TheLexeme * b) -{ - if (a->lexeme == NULL) - { - if (b->lexeme == NULL) - return 0; - else - return 1; - } - else if (b->lexeme == NULL) - return -1; - - return strcmp(a->lexeme, b->lexeme); -} - -static int -cmpLexemeQ(const void *a, const void *b) -{ - return cmpLexeme((TheLexeme *) a, (TheLexeme *) b); -} - -static int -cmpTheLexeme(const void *a, const void *b) -{ - TheLexeme *la = (TheLexeme *) a; - TheLexeme *lb = (TheLexeme *) b; - int res; - - if ((res = cmpLexeme(la, lb)) != 0) - return res; - - return -cmpLexemeInfo(la->entries, lb->entries); -} - -static void -compileTheLexeme(DictThesaurus * d) -{ - int i, - nnw = 0, - tnm = 16; - TheLexeme *newwrds = (TheLexeme *) malloc(sizeof(TheLexeme) * tnm), - *ptrwrds; - - if (!newwrds) - elog(ERROR, "Out of memory"); - - for (i = 0; i < d->nwrds; i++) - { - TSLexeme *ptr; - - ptr = (TSLexeme *) DatumGetPointer( - FunctionCall4( - &(d->subdict.lexize_info), - PointerGetDatum(d->subdict.dictionary), - PointerGetDatum(d->wrds[i].lexeme), - Int32GetDatum(strlen(d->wrds[i].lexeme)), - PointerGetDatum(NULL) - ) - ); - - if (!(ptr && ptr->lexeme)) - { - if (!ptr) - elog(ERROR, "Thesaurus: word-sample '%s' isn't recognized by subdictionary (rule %d)", - d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1); - else - elog(NOTICE, "Thesaurus: word-sample '%s' is recognized as stop-word, assign any stop-word (rule %d)", - d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1); - - newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0); - } - else - { - while (ptr->lexeme) - { - TSLexeme *remptr = ptr + 1; - int tnvar = 1; - int curvar = ptr->nvariant; - - /* compute n words in one variant */ - while (remptr->lexeme) - { - if (remptr->nvariant != (remptr - 1)->nvariant) - break; - tnvar++; - remptr++; - } - - remptr = ptr; - while (remptr->lexeme && remptr->nvariant == curvar) - { - newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar); - remptr++; - } - - ptr = remptr; - } - } - - free(d->wrds[i].lexeme); - free(d->wrds[i].entries); - } - - free(d->wrds); - d->wrds = newwrds; - d->nwrds = nnw; - d->ntwrds = tnm; - - if (d->nwrds > 1) - { - qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme); - - /* uniq */ - newwrds = d->wrds; - ptrwrds = d->wrds + 1; - while (ptrwrds - d->wrds < d->nwrds) - { - if (cmpLexeme(ptrwrds, newwrds) == 0) - { - if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries)) - { - ptrwrds->entries->nextentry = newwrds->entries; - newwrds->entries = ptrwrds->entries; - } - else - free(ptrwrds->entries); - - if (ptrwrds->lexeme) - free(ptrwrds->lexeme); - } - else - { - newwrds++; - *newwrds = *ptrwrds; - } - - ptrwrds++; - } - - d->nwrds = newwrds - d->wrds + 1; - d->wrds = (TheLexeme *) realloc(d->wrds, sizeof(TheLexeme) * d->nwrds); - } -} - -static void -compileTheSubstitute(DictThesaurus * d) -{ - int i; - - for (i = 0; i < d->nsubst; i++) - { - TSLexeme *rem = d->subst[i].res, - *outptr, - *inptr; - int n = 2; - - outptr = d->subst[i].res = (TSLexeme *) malloc(sizeof(TSLexeme) * n); - if (d->subst[i].res == NULL) - elog(ERROR, "Out of Memory"); - outptr->lexeme = NULL; - inptr = rem; - - while (inptr && inptr->lexeme) - { - TSLexeme *lexized, - tmplex[2]; - - if (inptr->flags & DT_USEASIS) - { /* do not lexize */ - tmplex[0] = *inptr; - tmplex[0].flags = 0; - tmplex[1].lexeme = NULL; - lexized = tmplex; - } - else - { - lexized = (TSLexeme *) DatumGetPointer( - FunctionCall4( - &(d->subdict.lexize_info), - PointerGetDatum(d->subdict.dictionary), - PointerGetDatum(inptr->lexeme), - Int32GetDatum(strlen(inptr->lexeme)), - PointerGetDatum(NULL) - ) - ); - } - - if (lexized && lexized->lexeme) - { - int toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1; - - while (lexized->lexeme) - { - if (outptr - d->subst[i].res + 1 >= n) - { - int diff = outptr - d->subst[i].res; - - n *= 2; - d->subst[i].res = (TSLexeme *) realloc(d->subst[i].res, sizeof(TSLexeme) * n); - if (d->subst[i].res == NULL) - elog(ERROR, "Out of Memory"); - outptr = d->subst[i].res + diff; - } - - *outptr = *lexized; - if ((outptr->lexeme = strdup(lexized->lexeme)) == NULL) - elog(ERROR, "Out of Memory"); - - outptr++; - lexized++; - } - - if (toset > 0) - d->subst[i].res[toset].flags |= TSL_ADDPOS; - } - else if (lexized) - { - elog(NOTICE, "Thesaurus: word '%s' in substition is a stop-word, ignored (rule %d)", inptr->lexeme, i + 1); - } - else - { - elog(ERROR, "Thesaurus: word '%s' in substition isn't recognized (rule %d)", inptr->lexeme, i + 1); - } - - if (inptr->lexeme) - free(inptr->lexeme); - inptr++; - } - - if (outptr == d->subst[i].res) - elog(ERROR, "Thesaurus: all words in subsitution are stop word (rule %d)", i + 1); - - d->subst[i].reslen = outptr - d->subst[i].res; - - free(rem); - } -} - -Datum -thesaurus_init(PG_FUNCTION_ARGS) -{ - DictThesaurus *d; - Map *cfg, - *pcfg; - text *in, - *subdictname = NULL; - bool fileloaded = false; - - if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL) - ereport(ERROR, - (errcode(ERRCODE_CONFIG_FILE_ERROR), - errmsg("Thesaurus confguration error"))); - - d = (DictThesaurus *) malloc(sizeof(DictThesaurus)); - if (!d) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - memset(d, 0, sizeof(DictThesaurus)); - - in = PG_GETARG_TEXT_P(0); - parse_cfgdict(in, &cfg); - PG_FREE_IF_COPY(in, 0); - pcfg = cfg; - while (pcfg->key) - { - if (pg_strcasecmp("DictFile", pcfg->key) == 0) - { - if (fileloaded) - { - freeDictThesaurus(d); - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("Thesaurus file is already loaded"))); - } - fileloaded = true; - thesaurusRead(pcfg->value, d); - } - else if (pg_strcasecmp("Dictionary", pcfg->key) == 0) - { - if (subdictname) - { - freeDictThesaurus(d); - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("Thesaurus: SubDictionary is already defined"))); - } - subdictname = char2text(pcfg->value); - } - else - { - freeDictThesaurus(d); - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("unrecognized option: %s => %s", - pcfg->key, pcfg->value))); - } - pfree(pcfg->key); - pfree(pcfg->value); - pcfg++; - } - pfree(cfg); - - if (!fileloaded) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("Thesaurus file isn't defined"))); - - if (subdictname) - { - DictInfo *subdictptr; - - /* - * we already in SPI, but name2id_dict()/finddict() invoke - * SPI_connect() - */ - SPI_push(); - - subdictptr = finddict(name2id_dict(subdictname)); - - SPI_pop(); - - d->subdict = *subdictptr; - } - else - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("Thesaurus: SubDictionary isn't defined"))); - - compileTheLexeme(d); - compileTheSubstitute(d); - - PG_RETURN_POINTER(d); -} - -static LexemeInfo * -findTheLexeme(DictThesaurus * d, char *lexeme) -{ - TheLexeme key, *res; - - if (d->nwrds == 0) - return NULL; - - key.lexeme = lexeme; - key.entries = NULL; - - res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ); - - if (res == NULL) - return NULL; - return res->entries; -} - -static bool -matchIdSubst(LexemeInfo * stored, uint16 idsubst) -{ - bool res = true; - - if (stored) - { - res = false; - - for (; stored; stored = stored->nextvariant) - if (stored->idsubst == idsubst) - { - res = true; - break; - } - } - - return res; -} - -static LexemeInfo * -findVariant(LexemeInfo * in, LexemeInfo * stored, uint16 curpos, LexemeInfo ** newin, int newn) -{ - for (;;) - { - int i; - LexemeInfo *ptr = newin[0]; - - for (i = 0; i < newn; i++) - { - while (newin[i] && newin[i]->idsubst < ptr->idsubst) - newin[i] = newin[i]->nextentry; - - if (newin[i] == NULL) - return in; - - if (newin[i]->idsubst > ptr->idsubst) - { - ptr = newin[i]; - i = -1; - continue; - } - - while (newin[i]->idsubst == ptr->idsubst) - { - if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn) - { - ptr = newin[i]; - break; - } - - newin[i] = newin[i]->nextentry; - if (newin[i] == NULL) - return in; - } - - if (newin[i]->idsubst != ptr->idsubst) - { - ptr = newin[i]; - i = -1; - continue; - } - } - - if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst))) - { /* found */ - - ptr->nextvariant = in; - in = ptr; - } - - /* step forward */ - for (i = 0; i < newn; i++) - newin[i] = newin[i]->nextentry; - } - - return NULL; -} - -static TSLexeme * -copyTSLexeme(TheSubstitute * ts) -{ - TSLexeme *res; - uint16 i; - - res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1)); - for (i = 0; i < ts->reslen; i++) - { - res[i] = ts->res[i]; - res[i].lexeme = pstrdup(ts->res[i].lexeme); - } - - res[ts->reslen].lexeme = NULL; - - return res; -} - -static TSLexeme * -checkMatch(DictThesaurus * d, LexemeInfo * info, uint16 curpos, bool *moreres) -{ - *moreres = false; - while (info) - { - Assert(info->idsubst < d->nsubst); - if (info->nextvariant) - *moreres = true; - if (d->subst[info->idsubst].lastlexeme == curpos) - return copyTSLexeme(d->subst + info->idsubst); - info = info->nextvariant; - } - - return NULL; -} - -Datum -thesaurus_lexize(PG_FUNCTION_ARGS) -{ - DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0); - DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3); - TSLexeme *res = NULL; - LexemeInfo *stored, - *info = NULL; - uint16 curpos = 0; - bool moreres = false; - - if (dstate == NULL || PG_NARGS() < 4) - elog(ERROR, "Forbidden call of thesaurus or nested call"); - - if (dstate->isend) - PG_RETURN_POINTER(NULL); - stored = (LexemeInfo *) dstate->private; - - if (stored) - curpos = stored->posinsubst + 1; - - res = (TSLexeme *) DatumGetPointer( - FunctionCall4( - &(d->subdict.lexize_info), - PointerGetDatum(d->subdict.dictionary), - PG_GETARG_DATUM(1), - PG_GETARG_INT32(2), - PointerGetDatum(NULL) - ) - ); - - if (res && res->lexeme) - { - TSLexeme *ptr = res, - *basevar; - - while (ptr->lexeme) - { - uint16 nv = ptr->nvariant; - uint16 i, - nlex = 0; - LexemeInfo **infos; - - basevar = ptr; - while (ptr->lexeme && nv == ptr->nvariant) - { - nlex++; - ptr++; - } - - infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex); - for (i = 0; i < nlex; i++) - if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL) - break; - - if (i < nlex) - { - /* no chance to find */ - pfree(infos); - continue; - } - - info = findVariant(info, stored, curpos, infos, nlex); - } - } - else if (res) - { /* stop-word */ - LexemeInfo *infos = findTheLexeme(d, NULL); - - info = findVariant(NULL, stored, curpos, &infos, 1); - } - else - { - info = NULL; /* word isn't recognized */ - } - - dstate->private = (void *) info; - - if (!info) - { - dstate->getnext = false; - PG_RETURN_POINTER(NULL); - } - - if ((res = checkMatch(d, info, curpos, &moreres)) != NULL) - { - dstate->getnext = moreres; - PG_RETURN_POINTER(res); - } - - dstate->getnext = true; - - PG_RETURN_POINTER(NULL); -} |