Add some code to MemTranslate() to prevent the READ_UTF8() macro from overreading a buffer. (CVS 4033)

FossilOrigin-Name: 0595319cd716611bb5a12eb952e395283ea749bb
2025-11-09 14:21:03 +03:00 · 2007-05-23 16:23:09 +00:00
parent 17a240a25e
commit 7677c0cc62
5 changed files with 94 additions and 14 deletions
--- a/src/utf.c
+++ b/src/utf.c
@@ -12,7 +12,7 @@
 ** This file contains routines used to translate between UTF-8, 
 ** UTF-16, UTF-16BE, and UTF-16LE.
 **
-** $Id: utf.c,v 1.50 2007/05/16 18:23:05 danielk1977 Exp $
+** $Id: utf.c,v 1.51 2007/05/23 16:23:09 danielk1977 Exp $
 **
 ** Notes on UTF-8:
 **
@@ -219,12 +219,71 @@ int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
  z = zOut;

  if( pMem->enc==SQLITE_UTF8 ){
+    unsigned int iExtra = 0xD800;
+
+    if( 0==(pMem->flags&MEM_Term) && zTerm>zIn && (zTerm[-1]&0x80) ){
+      /* This UTF8 string is not nul-terminated, and the last byte is
+      ** not a character in the ascii range (codpoints 0..127). This
+      ** means the SQLITE_READ_UTF8() macro might read past the end
+      ** of the allocated buffer.
+      **
+      ** There are four possibilities:
+      **
+      **   1. The last byte is the first byte of a non-ASCII character,
+      **
+      **   2. The final N bytes of the input string are continuation bytes
+      **      and immediately preceding them is the first byte of a 
+      **      non-ASCII character.
+      **
+      **   3. The final N bytes of the input string are continuation bytes
+      **      and immediately preceding them is a byte that encodes a 
+      **      character in the ASCII range.
+      **
+      **   4. The entire string consists of continuation characters.
+      **
+      ** Cases (3) and (4) require no special handling. The SQLITE_READ_UTF8()
+      ** macro will not overread the buffer in these cases.
+      */
+      unsigned char *zExtra = &zTerm[-1];
+      while( zExtra>zIn && (zExtra[0]&0xC0)==0x80 ){
+        zExtra--;
+      }
+
+      if( (zExtra[0]&0xC0)==0xC0 ){
+        /* Make a copy of the last character encoding in the input string.
+        ** Then make sure it is nul-terminated and use SQLITE_READ_UTF8()
+        ** to decode the codepoint. Store the codepoint in variable iExtra,
+        ** it will be appended to the output string later.
+        */
+        unsigned char *zFree = 0;
+        unsigned char zBuf[16];
+        int nExtra = (pMem->n+zIn-zExtra);
+        zTerm = zExtra;
+        if( nExtra>15 ){
+          zExtra = sqliteMallocRaw(nExtra+1);
+          if( !zExtra ){
+            return SQLITE_NOMEM;
+          }
+          zFree = zExtra;
+        }else{
+          zExtra = zBuf;
+        }
+        memcpy(zExtra, zTerm, nExtra);
+        zExtra[nExtra] = '\0';
+        SQLITE_READ_UTF8(zExtra, iExtra);
+        sqliteFree(zFree);
+      }
+    }
+
    if( desiredEnc==SQLITE_UTF16LE ){
      /* UTF-8 -> UTF-16 Little-endian */
      while( zIn<zTerm ){
        SQLITE_READ_UTF8(zIn, c); 
        WRITE_UTF16LE(z, c);
      }
+      if( iExtra!=0xD800 ){
+        WRITE_UTF16LE(z, iExtra);
+      }
    }else{
      assert( desiredEnc==SQLITE_UTF16BE );
      /* UTF-8 -> UTF-16 Big-endian */
@@ -232,6 +291,9 @@ int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
        SQLITE_READ_UTF8(zIn, c); 
        WRITE_UTF16BE(z, c);
      }
+      if( iExtra!=0xD800 ){
+        WRITE_UTF16BE(z, iExtra);
+      }
    }
    pMem->n = z - zOut;
    *z++ = 0;