aboutsummaryrefslogtreecommitdiff
path: root/ext/fts5/fts5_index.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/fts5/fts5_index.c')
-rw-r--r--ext/fts5/fts5_index.c50
1 files changed, 45 insertions, 5 deletions
diff --git a/ext/fts5/fts5_index.c b/ext/fts5/fts5_index.c
index 015696f7b..eb20af816 100644
--- a/ext/fts5/fts5_index.c
+++ b/ext/fts5/fts5_index.c
@@ -5318,10 +5318,13 @@ int sqlite3Fts5IndexCharlenToBytelen(
for(i=0; i<nChar; i++){
if( n>=nByte ) return 0; /* Input contains fewer than nChar chars */
if( (unsigned char)p[n++]>=0xc0 ){
- if( n>=nByte ) break;
+ if( n>=nByte ) return 0;
while( (p[n] & 0xc0)==0x80 ){
n++;
- if( n>=nByte ) break;
+ if( n>=nByte ){
+ if( i+1==nChar ) break;
+ return 0;
+ }
}
}
}
@@ -5723,6 +5726,37 @@ static int fts5QueryCksum(
return rc;
}
+/*
+** Check if buffer z[], size n bytes, contains a series of well-formed utf-8
+** encoded codepoints (a lead byte followed by the right number of 10xxxxxx
+** trail bytes; no overlong/range checks). Return 0 if so, else non-zero.
+*/
+static int fts5TestUtf8(const char *z, int n){
+  int i = 0;                            /* Offset of next lead byte in z[] */
+  assert_nc( n>0 );
+  while( i<n ){
+    if( (z[i] & 0x80)==0x00 ){          /* 0xxxxxxx: single byte */
+      i++;
+    }else
+    if( (z[i] & 0xE0)==0xC0 ){          /* 110xxxxx: 2-byte sequence */
+      if( i+1>=n || (z[i+1] & 0xC0)!=0x80 ) return 1;
+      i += 2;
+    }else
+    if( (z[i] & 0xF0)==0xE0 ){          /* 1110xxxx: 3-byte sequence */
+      if( i+2>=n || (z[i+1] & 0xC0)!=0x80 || (z[i+2] & 0xC0)!=0x80 ) return 1;
+      i += 3;
+    }else
+    if( (z[i] & 0xF8)==0xF0 ){          /* 11110xxx: 4-byte sequence */
+      if( i+3>=n || (z[i+1] & 0xC0)!=0x80 || (z[i+2] & 0xC0)!=0x80 ) return 1;
+      if( (z[i+3] & 0xC0)!=0x80 ) return 1;   /* third trail byte */
+      i += 4;                           /* consume lead + 3 trail bytes */
+    }else{
+      return 1;                         /* stray trail byte or bad lead byte */
+    }
+  }
+
+  return 0;
+}
/*
** This function is also purely an internal test. It does not contribute to
@@ -5763,8 +5797,14 @@ static void fts5TestTerm(
** This check may only be performed if the hash table is empty. This
** is because the hash table only supports a single scan query at
** a time, and the multi-iter loop from which this function is called
- ** is already performing such a scan. */
- if( p->nPendingData==0 ){
+ ** is already performing such a scan.
+ **
+ ** Also only do this if buffer zTerm contains nTerm bytes of valid
+ ** utf-8. Otherwise, the last part of the buffer contents might contain
+ ** a non-utf-8 sequence that happens to be a prefix of a valid utf-8
+ ** character stored in the main fts index, which will cause the
+ ** test to fail. */
+ if( p->nPendingData==0 && 0==fts5TestUtf8(zTerm, nTerm) ){
if( iIdx>0 && rc==SQLITE_OK ){
int f = flags|FTS5INDEX_QUERY_TEST_NOIDX;
ck2 = 0;
@@ -5897,8 +5937,8 @@ static void fts5IndexIntegrityCheckSegment(
i64 iRow; /* Rowid for this leaf */
Fts5Data *pLeaf; /* Data for this leaf */
+ const char *zIdxTerm = (const char*)sqlite3_column_blob(pStmt, 1);
int nIdxTerm = sqlite3_column_bytes(pStmt, 1);
- const char *zIdxTerm = (const char*)sqlite3_column_text(pStmt, 1);
int iIdxLeaf = sqlite3_column_int(pStmt, 2);
int bIdxDlidx = sqlite3_column_int(pStmt, 3);