diff options
author | drh <drh@noemail.net> | 2019-12-31 14:49:10 +0000 |
---|---|---|
committer | drh <drh@noemail.net> | 2019-12-31 14:49:10 +0000 |
commit | 5f6a2ed7f275a201a8ce9fd81b60a2a552c20f6b (patch) | |
tree | 0a8359d1c5b0829d9a6292e85665979bf364e076 /ext/fts5/fts5_index.c | |
parent | d335bc40a6576d8759466d8efaaaf82b04667aaa (diff) | |
parent | 02ff747bc0a6039cddf6108719426d247026fa18 (diff) | |
download | sqlite-5f6a2ed7f275a201a8ce9fd81b60a2a552c20f6b.tar.gz sqlite-5f6a2ed7f275a201a8ce9fd81b60a2a552c20f6b.zip |
Merge recent enhancements from trunk.
FossilOrigin-Name: 39d55579376906f212271ce9b2d367e3ad029fb173f22c7253312b467970208a
Diffstat (limited to 'ext/fts5/fts5_index.c')
-rw-r--r-- | ext/fts5/fts5_index.c | 50 |
1 files changed, 45 insertions, 5 deletions
diff --git a/ext/fts5/fts5_index.c b/ext/fts5/fts5_index.c index 015696f7b..eb20af816 100644 --- a/ext/fts5/fts5_index.c +++ b/ext/fts5/fts5_index.c @@ -5318,10 +5318,13 @@ int sqlite3Fts5IndexCharlenToBytelen( for(i=0; i<nChar; i++){ if( n>=nByte ) return 0; /* Input contains fewer than nChar chars */ if( (unsigned char)p[n++]>=0xc0 ){ - if( n>=nByte ) break; + if( n>=nByte ) return 0; while( (p[n] & 0xc0)==0x80 ){ n++; - if( n>=nByte ) break; + if( n>=nByte ){ + if( i+1==nChar ) break; + return 0; + } } } } @@ -5723,6 +5726,37 @@ static int fts5QueryCksum( return rc; } +/* +** Check if buffer z[], size n bytes, contains a series of valid utf-8 +** encoded codepoints. If so, return 0. Otherwise, if the buffer does not +** contain valid utf-8, return non-zero. +*/ +static int fts5TestUtf8(const char *z, int n){ + int i = 0; + assert_nc( n>0 ); + while( i<n ){ + if( (z[i] & 0x80)==0x00 ){ + i++; + }else + if( (z[i] & 0xE0)==0xC0 ){ + if( i+1>=n || (z[i+1] & 0xC0)!=0x80 ) return 1; + i += 2; + }else + if( (z[i] & 0xF0)==0xE0 ){ + if( i+2>=n || (z[i+1] & 0xC0)!=0x80 || (z[i+2] & 0xC0)!=0x80 ) return 1; + i += 3; + }else + if( (z[i] & 0xF8)==0xF0 ){ + if( i+3>=n || (z[i+1] & 0xC0)!=0x80 || (z[i+2] & 0xC0)!=0x80 ) return 1; + if( (z[i+3] & 0xC0)!=0x80 ) return 1; /* 4th byte: continuation */ + i += 4; + }else{ + return 1; + } + } + + return 0; +} /* ** This function is also purely an internal test. It does not contribute to @@ -5763,8 +5797,14 @@ static void fts5TestTerm( ** This check may only be performed if the hash table is empty. This ** is because the hash table only supports a single scan query at ** a time, and the multi-iter loop from which this function is called - ** is already performing such a scan. */ - if( p->nPendingData==0 ){ + ** is already performing such a scan. + ** + ** Also only do this if buffer zTerm contains nTerm bytes of valid + ** utf-8. 
Otherwise, the last part of the buffer contents might contain + ** a non-utf-8 sequence that happens to be a prefix of a valid utf-8 + ** character stored in the main fts index, which will cause the + ** test to fail. */ + if( p->nPendingData==0 && 0==fts5TestUtf8(zTerm, nTerm) ){ if( iIdx>0 && rc==SQLITE_OK ){ int f = flags|FTS5INDEX_QUERY_TEST_NOIDX; ck2 = 0; @@ -5897,8 +5937,8 @@ static void fts5IndexIntegrityCheckSegment( i64 iRow; /* Rowid for this leaf */ Fts5Data *pLeaf; /* Data for this leaf */ + const char *zIdxTerm = (const char*)sqlite3_column_blob(pStmt, 1); int nIdxTerm = sqlite3_column_bytes(pStmt, 1); - const char *zIdxTerm = (const char*)sqlite3_column_text(pStmt, 1); int iIdxLeaf = sqlite3_column_int(pStmt, 2); int bIdxDlidx = sqlite3_column_int(pStmt, 3); |