Fix a problem with LIKE and GLOB processing in utf-16be databases in cases where the utf-8 encoding of a character ends with the byte 0xBF.

FossilOrigin-Name: 4b4f33d791fe4318c4597bee7d2f9e486ed223e731982af470f5cc0dbdc600fc
author: dan <Dan Kennedy> 2025-01-31 14:52:36 +0000
committer: dan <Dan Kennedy> 2025-01-31 14:52:36 +0000
commit: f6ca35d88cad6521ad06c6c9e1bfe0e3066d8cbb (patch)
tree: 3d9bb8c7ade44432f4beb24748cffbe93dbbb469 /src
parent: a0337903030a123f09eae41a5146993ff81d83e4 (diff)
download: sqlite-f6ca35d88cad6521ad06c6c9e1bfe0e3066d8cbb.tar.gz sqlite-f6ca35d88cad6521ad06c6c9e1bfe0e3066d8cbb.zip
1 file changed, 20 insertions, 12 deletions
diff --git a/src/whereexpr.c b/src/whereexpr.c
index 2b6eb6a78..0a963f4f9 100644
--- a/src/whereexpr.c
+++ b/src/whereexpr.c
@@ -219,12 +219,12 @@ static int isLikeOrGlob(
      z = (u8*)pRight->u.zToken;
   }
   if( z ){
-    /* Count the number of prefix bytes prior to the first wildcard.
-    ** or U+fffd character.  If the underlying database has a UTF16LE
-    ** encoding, then only consider ASCII characters.  Note that the
-    ** encoding of z[] is UTF8 - we are dealing with only UTF8 here in
-    ** this code, but the database engine itself might be processing
-    ** content using a different encoding. */
+    /* Count the number of prefix bytes prior to the first wildcard,
+    ** U+fffd character, or malformed utf-8. If the underlying database
+    ** has a UTF16LE encoding, then only consider ASCII characters.  Note that
+    ** the encoding of z[] is UTF8 - we are dealing with only UTF8 here in this
+    ** code, but the database engine itself might be processing content using a
+    ** different encoding. */
     cnt = 0;
     while( (c=z[cnt])!=0 && c!=wc[0] && c!=wc[1] && c!=wc[2] ){
       cnt++;
@@ -232,7 +232,9 @@ static int isLikeOrGlob(
         cnt++;
       }else if( c>=0x80 ){
         const u8 *z2 = z+cnt-1;
-        if( sqlite3Utf8Read(&z2)==0xfffd || ENC(db)==SQLITE_UTF16LE ){
+        if( sqlite3Utf8Read(&z2)==0xfffd || c==0xFF   /* bad utf-8 */
+         || ENC(db)==SQLITE_UTF16LE 
+        ){
           cnt--;
           break;
         }else{
@@ -1384,9 +1386,8 @@ static void exprAnalyze(
     }
 
     if( !db->mallocFailed ){
-      u8 c, *pC;       /* Last character before the first wildcard */
+      u8 *pC;       /* Last character before the first wildcard */
       pC = (u8*)&pStr2->u.zToken[sqlite3Strlen30(pStr2->u.zToken)-1];
-      c = *pC;
       if( noCase ){
         /* The point is to increment the last character before the first
         ** wildcard.  But if we increment '@', that will push it into the
@@ -1394,10 +1395,17 @@ static void exprAnalyze(
         ** inequality.  To avoid this, make sure to also run the full
         ** LIKE on all candidate expressions by clearing the isComplete flag
         */
-        if( c=='A'-1 ) isComplete = 0;
-        c = sqlite3UpperToLower[c];
+        if( *pC=='A'-1 ) isComplete = 0;
+        *pC = sqlite3UpperToLower[*pC];
+      }
+
+      /* Increment the value of the last utf8 character in the prefix. */
+      while( *pC==0xBF && pC>(u8*)pStr2->u.zToken ){
+        *pC = 0x80;
+        pC--;
       }
-      *pC = c + 1;
+      assert( *pC!=0xFF );        /* isLikeOrGlob() guarantees this */
+      (*pC)++;
     }
     zCollSeqName = noCase ? "NOCASE" : sqlite3StrBINARY;
     pNewExpr1 = sqlite3ExprDup(db, pLeft, 0);
author	dan <Dan Kennedy>	2025-01-31 14:52:36 +0000
committer	dan <Dan Kennedy>	2025-01-31 14:52:36 +0000
commit	f6ca35d88cad6521ad06c6c9e1bfe0e3066d8cbb (patch)
tree	3d9bb8c7ade44432f4beb24748cffbe93dbbb469 /src
parent	a0337903030a123f09eae41a5146993ff81d83e4 (diff)
download	sqlite-f6ca35d88cad6521ad06c6c9e1bfe0e3066d8cbb.tar.gz sqlite-f6ca35d88cad6521ad06c6c9e1bfe0e3066d8cbb.zip