diff options
Diffstat (limited to 'ext/fts5/fts5_index.c')
-rw-r--r-- | ext/fts5/fts5_index.c | 50 |
1 files changed, 45 insertions, 5 deletions
diff --git a/ext/fts5/fts5_index.c b/ext/fts5/fts5_index.c index 015696f7b..eb20af816 100644 --- a/ext/fts5/fts5_index.c +++ b/ext/fts5/fts5_index.c @@ -5318,10 +5318,13 @@ int sqlite3Fts5IndexCharlenToBytelen( for(i=0; i<nChar; i++){ if( n>=nByte ) return 0; /* Input contains fewer than nChar chars */ if( (unsigned char)p[n++]>=0xc0 ){ - if( n>=nByte ) break; + if( n>=nByte ) return 0; while( (p[n] & 0xc0)==0x80 ){ n++; - if( n>=nByte ) break; + if( n>=nByte ){ + if( i+1==nChar ) break; + return 0; + } } } } @@ -5723,6 +5726,37 @@ static int fts5QueryCksum( return rc; } +/* +** Check if buffer z[], size n bytes, contains as series of valid utf-8 +** encoded codepoints. If so, return 0. Otherwise, if the buffer does not +** contain valid utf-8, return non-zero. +*/ +static int fts5TestUtf8(const char *z, int n){ + assert_nc( n>0 ); + int i = 0; + while( i<n ){ + if( (z[i] & 0x80)==0x00 ){ + i++; + }else + if( (z[i] & 0xE0)==0xC0 ){ + if( i+1>=n || (z[i+1] & 0xC0)!=0x80 ) return 1; + i += 2; + }else + if( (z[i] & 0xF0)==0xE0 ){ + if( i+2>=n || (z[i+1] & 0xC0)!=0x80 || (z[i+2] & 0xC0)!=0x80 ) return 1; + i += 3; + }else + if( (z[i] & 0xF8)==0xF0 ){ + if( i+3>=n || (z[i+1] & 0xC0)!=0x80 || (z[i+2] & 0xC0)!=0x80 ) return 1; + if( (z[i+2] & 0xC0)!=0x80 ) return 1; + i += 3; + }else{ + return 1; + } + } + + return 0; +} /* ** This function is also purely an internal test. It does not contribute to @@ -5763,8 +5797,14 @@ static void fts5TestTerm( ** This check may only be performed if the hash table is empty. This ** is because the hash table only supports a single scan query at ** a time, and the multi-iter loop from which this function is called - ** is already performing such a scan. */ - if( p->nPendingData==0 ){ + ** is already performing such a scan. + ** + ** Also only do this if buffer zTerm contains nTerm bytes of valid + ** utf-8. Otherwise, the last part of the buffer contents might contain + ** a non-utf-8 sequence that happens to be a prefix of a valid utf-8 + ** character stored in the main fts index, which will cause the + ** test to fail. */ + if( p->nPendingData==0 && 0==fts5TestUtf8(zTerm, nTerm) ){ if( iIdx>0 && rc==SQLITE_OK ){ int f = flags|FTS5INDEX_QUERY_TEST_NOIDX; ck2 = 0; @@ -5897,8 +5937,8 @@ static void fts5IndexIntegrityCheckSegment( i64 iRow; /* Rowid for this leaf */ Fts5Data *pLeaf; /* Data for this leaf */ + const char *zIdxTerm = (const char*)sqlite3_column_blob(pStmt, 1); int nIdxTerm = sqlite3_column_bytes(pStmt, 1); - const char *zIdxTerm = (const char*)sqlite3_column_text(pStmt, 1); int iIdxLeaf = sqlite3_column_int(pStmt, 2); int bIdxDlidx = sqlite3_column_int(pStmt, 3); |