1
0
mirror of https://github.com/sqlite/sqlite.git synced 2025-07-30 19:03:16 +03:00

Add support for "fossil deltas" to RBU and "sqldiff --rbu".

FossilOrigin-Name: e26ef165fe2f7524684af0d269d38475ea8b9489
This commit is contained in:
dan
2015-07-31 19:52:03 +00:00
parent 8ff2587b66
commit a9ca8afef5
6 changed files with 773 additions and 23 deletions

View File

@ -23,6 +23,7 @@
#include <stdarg.h>
#include <ctype.h>
#include <string.h>
#include <assert.h>
#include "sqlite3.h"
/*
@ -754,6 +755,411 @@ static void checkSchemasMatch(const char *zTab){
sqlite3_finalize(pStmt);
}
/**************************************************************************
** The following code is copied from fossil. It is used to generate the
** fossil delta blobs sometimes used in RBU update records.
*/
typedef unsigned short u16;   /* Used for the rolling-hash accumulators and window index */
typedef unsigned int u32;     /* Used for the combined hash value (presumably 32 bits wide) */
typedef unsigned char u8;     /* A single raw byte */
/*
** The width of a hash window in bytes.  The algorithm only works if this
** is a power of 2, because hash_next() wraps the window index using the
** mask (NHASH-1) rather than a modulo operation.
*/
#define NHASH 16
/*
** State of the rolling hash over a window of NHASH bytes.
**
** z[] is a circular buffer holding the bytes currently inside the window.
** z[i] is the oldest (first) byte and z[(i+NHASH-1)%NHASH] is the newest
** (last) byte of the window.
**
** "a" is the plain sum of every byte in z[].  "b" is a weighted sum:
**
**     b = z[i]*NHASH + z[i+1]*(NHASH-1) + ... + z[i+NHASH-1]*1
**
** (Every index into z[] above is taken modulo NHASH; the %NHASH operator
** is left out of the expression for brevity.)
*/
typedef struct hash hash;
struct hash {
  u16 a;            /* Sum of the bytes in the window */
  u16 b;            /* Weighted sum of the bytes in the window */
  u16 i;            /* Index in z[] of the start of the hash window */
  char z[NHASH];    /* The values that have been hashed */
};
/*
** Prime the rolling hash pHash with the NHASH bytes found at z[0] through
** z[NHASH-1].  The window start index is reset to 0.
*/
static void hash_init(hash *pHash, const char *z){
  u16 sum = 0;        /* Running plain sum of the bytes */
  u16 wsum = 0;       /* Running weighted sum of the bytes */
  u16 k;
  for(k=0; k<NHASH; k++){
    sum += z[k];
    wsum += (NHASH-k)*z[k];     /* First byte gets weight NHASH, last gets 1 */
    pHash->z[k] = z[k];
  }
  pHash->i = 0;
  pHash->a = sum & 0xffff;
  pHash->b = wsum & 0xffff;
}
/*
** Slide the hash window forward by one byte: the oldest byte falls out of
** the window and "c" becomes the newest byte.
*/
static void hash_next(hash *pHash, int c){
  u16 iSlot = pHash->i;                 /* Slot holding the oldest byte */
  u16 evicted = pHash->z[iSlot];        /* Byte leaving the window */
  pHash->z[iSlot] = c;                  /* Reuse the slot for the new byte */
  pHash->i = (iSlot+1)&(NHASH-1);       /* Wrap; relies on NHASH being 2^k */
  pHash->a = pHash->a - evicted + c;
  /* Must use the freshly updated pHash->a here */
  pHash->b = pHash->b - NHASH*evicted + pHash->a;
}
/*
** Combine the two 16-bit accumulators into one 32-bit hash value.  The
** plain sum occupies the low 16 bits and the weighted sum the high 16 bits.
*/
static u32 hash_32bit(hash *pHash){
  u32 lo = pHash->a & 0xffff;
  u32 hi = (u32)(pHash->b & 0xffff);
  return lo | (hi<<16);
}
/*
** Append the base-64 rendering of v to the buffer at *pz and advance *pz
** past the digits written.  The most significant digit is written first
** and there are no leading zeros; the value 0 is written as the single
** digit '0'.  No terminator is written.
*/
static void putInt(unsigned int v, char **pz){
  static const char zDigits[] =
    "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~";
  /*  123456789 123456789 123456789 123456789 123456789 123456789 123 */
  char zBuf[20];          /* Digits, least significant first */
  char *zOut = *pz;       /* Output cursor */
  int n = 0;              /* Number of digits held in zBuf[] */

  /* Peel off 6 bits at a time.  do/while guarantees one digit for v==0. */
  do{
    zBuf[n++] = zDigits[v&0x3f];
    v >>= 6;
  }while( v>0 );

  /* Emit the collected digits in reverse: most significant first */
  while( n>0 ){
    *zOut++ = zBuf[--n];
  }
  *pz = zOut;
}
/*
** Decode a base-64 integer from the text at *pz.  Decoding stops at the
** first byte that is not a base-64 digit; on return *pz points at that
** byte.  *pLen is reduced by the number of digit bytes consumed.  If the
** first byte is not a digit, 0 is returned and nothing is consumed.
**
** Only the low 7 bits of each input byte are used to look up its digit
** value.  There is no bounds check: the input must contain a non-digit
** byte that ends the scan.
*/
static unsigned int getInt(const char **pz, int *pLen){
  static const signed char zValue[] = {
    -1, -1, -1, -1, -1, -1, -1, -1,   -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,   -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,   -1, -1, -1, -1, -1, -1, -1, -1,
     0,  1,  2,  3,  4,  5,  6,  7,    8,  9, -1, -1, -1, -1, -1, -1,
    -1, 10, 11, 12, 13, 14, 15, 16,   17, 18, 19, 20, 21, 22, 23, 24,
    25, 26, 27, 28, 29, 30, 31, 32,   33, 34, 35, -1, -1, -1, -1, 36,
    -1, 37, 38, 39, 40, 41, 42, 43,   44, 45, 46, 47, 48, 49, 50, 51,
    52, 53, 54, 55, 56, 57, 58, 59,   60, 61, 62, -1, -1, -1, 63, -1,
  };
  const unsigned char *zStart = (const unsigned char*)*pz;
  const unsigned char *zCur = zStart;     /* Next byte to examine */
  unsigned int v = 0;                     /* Value accumulated so far */

  for(;;){
    int digit = zValue[*zCur & 0x7f];
    if( digit<0 ) break;                  /* Not a base-64 digit: stop here */
    v = (v<<6) + digit;
    zCur++;
  }
  *pLen -= zCur - zStart;
  *pz = (const char*)zCur;
  return v;
}
/*
** Return the number of digits putInt() would emit for v, i.e. the length
** of the base-64 representation of v.  The result is always at least 1.
**
** v is interpreted as an unsigned quantity, matching putInt().  The value
** itself is shifted down six bits per digit.  The earlier form compared v
** against a power of 64 that was shifted up; that comparand wrapped to 0
** once v reached 2^30 (or was negative), making the loop spin forever.
*/
static int digit_count(int v){
  unsigned int u = (unsigned int)v;
  int n = 1;
  while( (u >>= 6)!=0 ){
    n++;
  }
  return n;
}
/*
** Compute a 32-bit checksum on the N-byte buffer zIn.  Return the result.
**
** The buffer is treated as a sequence of big-endian 32-bit words which are
** added together modulo 2^32.  A trailing partial word (1 to 3 bytes) is
** treated as if it were padded with zero bytes on the right.
**
** Four independent byte-lane sums are accumulated and only combined at the
** end; this gives the same result as summing whole words because all
** arithmetic is modulo 2^32.
*/
static unsigned int checksum(const char *zIn, size_t N){
  const unsigned char *z = (const unsigned char *)zIn;
  unsigned sum0 = 0;     /* Sum of the most significant byte of each word */
  unsigned sum1 = 0;
  unsigned sum2 = 0;
  unsigned sum3 = 0;     /* Sum of the least significant byte of each word */
  while(N >= 16){
    sum0 += ((unsigned)z[0] + z[4] + z[8] + z[12]);
    sum1 += ((unsigned)z[1] + z[5] + z[9] + z[13]);
    sum2 += ((unsigned)z[2] + z[6] + z[10]+ z[14]);
    sum3 += ((unsigned)z[3] + z[7] + z[11]+ z[15]);
    z += 16;
    N -= 16;
  }
  while(N >= 4){
    sum0 += z[0];
    sum1 += z[1];
    sum2 += z[2];
    sum3 += z[3];
    z += 4;
    N -= 4;
  }
  sum3 += (sum2 << 8) + (sum1 << 16) + (sum0 << 24);
  /* Fold in the 0..3 leftover bytes.  Each byte is widened to unsigned
  ** before shifting: shifting the int-promoted byte left by 24 would
  ** overflow a signed int for byte values >= 0x80. */
  switch(N){
    case 3:   sum3 += ((unsigned)z[2] << 8);    /* fall through */
    case 2:   sum3 += ((unsigned)z[1] << 16);   /* fall through */
    case 1:   sum3 += ((unsigned)z[0] << 24);   /* fall through */
    default:  ;
  }
  return sum3;
}
/*
** Create a new delta.
**
** The delta is written into a preallocated buffer, zDelta, which
** should be at least 60 bytes longer than the target file, zOut.
** The delta is NOT NUL-terminated and may contain embedded NUL
** characters if either the zSrc or zOut files are binary.  Callers
** must rely on the return value, which is the length of the delta
** in bytes.
**
** If the hash table needed to search for matches cannot be allocated,
** the routine does not fail: it emits a delta consisting of a single
** literal segment holding the entire target.  Such a delta is always
** valid, just larger than the target itself.
**
** Output Format:
**
** The delta begins with a base64 number followed by a newline.  This
** number is the number of bytes in the TARGET file.  Thus, given a
** delta file z, a program can compute the size of the output file
** simply by reading the first line and decoding the base-64 number
** found there.
**
** After the initial size number, the delta consists of a series of
** literal text segments and commands to copy from the SOURCE file.
** A copy command looks like this:
**
**     NNN@MMM,
**
** where NNN is the number of bytes to be copied and MMM is the offset
** into the source file of the first byte (both base-64).  If NNN is 0
** it means copy the rest of the input file.  Literal text is like this:
**
**     NNN:TTTTT
**
** where NNN is the number of bytes of text (base-64) and TTTTT is the text.
**
** The last term is of the form
**
**     NNN;
**
** In this case, NNN is a 32-bit bigendian checksum of the output file
** that can be used to verify that the delta applied correctly.  All
** numbers are in base-64.
**
** Pure text files generate a pure text delta.  Binary files generate a
** delta that may contain some binary data.
**
** Algorithm:
**
** The encoder first builds a hash table to help it find matching
** patterns in the source file.  16-byte chunks of the source file
** sampled at evenly spaced intervals are used to populate the hash
** table.
**
** Next we begin scanning the target file using a sliding 16-byte
** window.  The hash of the 16-byte window in the target is used to
** search for a matching section in the source file.  When a match
** is found, a copy command is added to the delta.  An effort is
** made to extend the matching section to regions that come before
** and after the 16-byte hash window.  A copy command is only issued
** if the result would use less space than just quoting the text
** literally.  Literal text is added to the delta for sections that
** do not match or which can not be encoded efficiently using copy
** commands.
*/
static int rbuDeltaCreate(
  const char *zSrc,      /* The source or pattern file */
  unsigned int lenSrc,   /* Length of the source file */
  const char *zOut,      /* The target file */
  unsigned int lenOut,   /* Length of the target file */
  char *zDelta           /* Write the delta into this buffer */
){
  int i, base;
  char *zOrigDelta = zDelta;
  hash h;
  int nHash = 0;                /* Number of hash table entries */
  int *landmark;                /* Primary hash table */
  int *collide = 0;             /* Collision chain */

  /* Add the target file size to the beginning of the delta
  */
  putInt(lenOut, &zDelta);
  *(zDelta++) = '\n';

  /* Allocate the hash table used to locate matching sections in the
  ** source file.  If the source file is very small there is no chance
  ** of ever doing a copy command, so do not bother allocating.
  */
  if( lenSrc>NHASH ){
    nHash = lenSrc/NHASH;
    collide = sqlite3_malloc( nHash*2*sizeof(int) );
  }

  /* Either the source is too small to be useful or the allocation
  ** failed.  Just output a single literal segment for the entire
  ** target and exit.
  */
  if( collide==0 ){
    putInt(lenOut, &zDelta);
    *(zDelta++) = ':';
    if( lenOut>0 ){
      /* Guarded because zOut may be NULL for a zero-length target */
      memcpy(zDelta, zOut, lenOut);
    }
    zDelta += lenOut;
    putInt(checksum(zOut, lenOut), &zDelta);
    *(zDelta++) = ';';
    return zDelta - zOrigDelta;
  }

  /* Populate the hash table.  collide[] and landmark[] share a single
  ** allocation; -1 marks an empty slot / end of chain.
  */
  landmark = &collide[nHash];
  memset(landmark, -1, nHash*sizeof(int));
  memset(collide, -1, nHash*sizeof(int));
  for(i=0; i<lenSrc-NHASH; i+=NHASH){
    int hv;
    hash_init(&h, &zSrc[i]);
    hv = hash_32bit(&h) % nHash;
    collide[i/NHASH] = landmark[hv];
    landmark[hv] = i/NHASH;
  }

  /* Begin scanning the target file and generating copy commands and
  ** literal sections of the delta.
  */
  base = 0;    /* We have already generated everything before zOut[base] */
  while( base+NHASH<lenOut ){
    int iSrc, iBlock;
    unsigned int bestCnt, bestOfst=0, bestLitsz=0;
    hash_init(&h, &zOut[base]);
    i = 0;     /* Trying to match a landmark against zOut[base+i] */
    bestCnt = 0;
    while( 1 ){
      int hv;
      int limit = 250;   /* Max number of chain entries examined per window */

      hv = hash_32bit(&h) % nHash;
      iBlock = landmark[hv];
      while( iBlock>=0 && (limit--)>0 ){
        /*
        ** The hash window has identified a potential match against
        ** landmark block iBlock.  But we need to investigate further.
        **
        ** Look for a region in zOut that matches zSrc. Anchor the search
        ** at zSrc[iSrc] and zOut[base+i].  Do not include anything prior to
        ** zOut[base] or after zOut[lenOut] nor anything after zSrc[lenSrc].
        **
        ** Set cnt equal to the length of the match and set ofst so that
        ** zSrc[ofst] is the first element of the match.  litsz is the number
        ** of characters between zOut[base] and the beginning of the match.
        ** sz will be the overhead (in bytes) needed to encode the copy
        ** command.  Only generate copy command if the overhead of the
        ** copy command is less than the amount of literal text to be copied.
        */
        int cnt, ofst, litsz;
        int j, k, x, y;
        int sz;

        /* Beginning at iSrc, match forwards as far as we can.  j counts
        ** the number of characters that match */
        iSrc = iBlock*NHASH;
        for(j=0, x=iSrc, y=base+i; x<lenSrc && y<lenOut; j++, x++, y++){
          if( zSrc[x]!=zOut[y] ) break;
        }
        j--;

        /* Beginning at iSrc-1, match backwards as far as we can.  k counts
        ** the number of characters that match */
        for(k=1; k<iSrc && k<=i; k++){
          if( zSrc[iSrc-k]!=zOut[base+i-k] ) break;
        }
        k--;

        /* Compute the offset and size of the matching region */
        ofst = iSrc-k;
        cnt = j+k+1;
        litsz = i-k;  /* Number of bytes of literal text before the copy */
        /* sz will hold the number of bytes needed to encode the "insert"
        ** command and the copy command, not counting the "insert" text */
        sz = digit_count(i-k)+digit_count(cnt)+digit_count(ofst)+3;
        if( cnt>=sz && cnt>bestCnt ){
          /* Remember this match only if it is the best so far and it
          ** does not increase the file size */
          bestCnt = cnt;
          bestOfst = iSrc-k;
          bestLitsz = litsz;
        }

        /* Check the next matching block */
        iBlock = collide[iBlock];
      }

      /* We have a copy command that does not cause the delta to be larger
      ** than a literal insert.  So add the copy command to the delta.
      */
      if( bestCnt>0 ){
        if( bestLitsz>0 ){
          /* Add an insert command before the copy */
          putInt(bestLitsz,&zDelta);
          *(zDelta++) = ':';
          memcpy(zDelta, &zOut[base], bestLitsz);
          zDelta += bestLitsz;
          base += bestLitsz;
        }
        base += bestCnt;
        putInt(bestCnt, &zDelta);
        *(zDelta++) = '@';
        putInt(bestOfst, &zDelta);
        *(zDelta++) = ',';
        bestCnt = 0;
        break;
      }

      /* If we reach this point, it means no match is found so far */
      if( base+i+NHASH>=lenOut ){
        /* We have reached the end of the file and have not found any
        ** matches.  Do an "insert" for everything that does not match */
        putInt(lenOut-base, &zDelta);
        *(zDelta++) = ':';
        memcpy(zDelta, &zOut[base], lenOut-base);
        zDelta += lenOut-base;
        base = lenOut;
        break;
      }

      /* Advance the hash by one character.  Keep looking for a match */
      hash_next(&h, zOut[base+i+NHASH]);
      i++;
    }
  }

  /* Output a final "insert" record to get all the text at the end of
  ** the file that does not match anything in the source file.
  */
  if( base<lenOut ){
    putInt(lenOut-base, &zDelta);
    *(zDelta++) = ':';
    memcpy(zDelta, &zOut[base], lenOut-base);
    zDelta += lenOut-base;
  }

  /* Output the final checksum record. */
  putInt(checksum(zOut, lenOut), &zDelta);
  *(zDelta++) = ';';
  sqlite3_free(collide);
  return zDelta - zOrigDelta;
}
/*
** End of code copied from fossil.
**************************************************************************/
static void strPrintfArray(
Str *pStr, /* String object to append to */
const char *zSep, /* Separator string */
@ -779,7 +1185,8 @@ static void getRbudiffQuery(
/* First the newly inserted rows: **/
strPrintf(pSql, "SELECT ");
strPrintfArray(pSql, ", ", "%s", azCol, -1);
strPrintf(pSql, ", 0"); /* Set ota_control to 0 for an insert */
strPrintf(pSql, ", 0, "); /* Set ota_control to 0 for an insert */
strPrintfArray(pSql, ", ", "NULL", azCol, -1);
strPrintf(pSql, " FROM aux.%Q AS n WHERE NOT EXISTS (\n", zTab);
strPrintf(pSql, " SELECT 1 FROM ", zTab);
strPrintf(pSql, " main.%Q AS o WHERE ", zTab);
@ -793,7 +1200,8 @@ static void getRbudiffQuery(
strPrintf(pSql, ", ");
strPrintfArray(pSql, ", ", "NULL", &azCol[nPK], -1);
}
strPrintf(pSql, ", 1"); /* Set ota_control to 1 for a delete */
strPrintf(pSql, ", 1, "); /* Set ota_control to 1 for a delete */
strPrintfArray(pSql, ", ", "NULL", azCol, -1);
strPrintf(pSql, " FROM main.%Q AS n WHERE NOT EXISTS (\n", zTab);
strPrintf(pSql, " SELECT 1 FROM ", zTab);
strPrintf(pSql, " aux.%Q AS o WHERE ", zTab);
@ -821,7 +1229,12 @@ static void getRbudiffQuery(
strPrintfArray(pSql, " ||\n",
" CASE WHEN n.%s IS o.%s THEN '.' ELSE 'x' END", &azCol[nPK], -1
);
strPrintf(pSql, "\nAS ota_control");
strPrintf(pSql, "\nAS ota_control, ");
strPrintfArray(pSql, ", ", "NULL", azCol, nPK);
strPrintf(pSql, ",\n");
strPrintfArray(pSql, " ,\n",
" CASE WHEN n.%s IS o.%s THEN NULL ELSE o.%s END", &azCol[nPK], -1
);
strPrintf(pSql, "\nFROM main.%Q AS o, aux.%Q AS n\nWHERE ", zTab, zTab);
strPrintfArray(pSql, " AND ", "(n.%Q IS o.%Q)", azCol, nPK);
@ -856,6 +1269,7 @@ static void rbudiff_one_table(const char *zTab, FILE *out){
if( azCol==0 ){
runtimeError("table %s has no usable PK columns", zTab);
}
for(nCol=0; azCol[nCol]; nCol++);
/* Build and output the CREATE TABLE statement for the data_xxx table */
strPrintf(&ct, "CREATE TABLE IF NOT EXISTS 'data_%q'(", zTab);
@ -863,7 +1277,6 @@ static void rbudiff_one_table(const char *zTab, FILE *out){
strPrintfArray(&ct, ", ", "%s", &azCol[bOtaRowid], -1);
strPrintf(&ct, ", rbu_control);");
/* Get the SQL for the query to retrieve data from the two databases */
getRbudiffQuery(zTab, azCol, nPK, bOtaRowid, &sql);
@ -875,18 +1288,68 @@ static void rbudiff_one_table(const char *zTab, FILE *out){
strPrintf(&insert, ", rbu_control) VALUES(");
pStmt = db_prepare("%s", sql.z);
nCol = sqlite3_column_count(pStmt);
while( sqlite3_step(pStmt)==SQLITE_ROW ){
/* If this is the first row output, print out the CREATE TABLE
** statement first. And then set ct.z to NULL so that it is not
** printed again. */
if( ct.z ){
fprintf(out, "%s\n", ct.z);
strFree(&ct);
}
/* Output the first part of the INSERT statement */
fprintf(out, "%s", insert.z);
for(i=0; i<nCol; i++){
if( i>0 ) fprintf(out, ", ");
printQuoted(out, sqlite3_column_value(pStmt, i));
if( sqlite3_column_type(pStmt, nCol)==SQLITE_INTEGER ){
for(i=0; i<=nCol; i++){
if( i>0 ) fprintf(out, ", ");
printQuoted(out, sqlite3_column_value(pStmt, i));
}
}else{
char *zOtaControl;
int nOtaControl = sqlite3_column_bytes(pStmt, nCol);
zOtaControl = (char*)sqlite3_malloc(nOtaControl);
memcpy(zOtaControl, sqlite3_column_text(pStmt, nCol), nOtaControl+1);
for(i=0; i<nCol; i++){
int bDone = 0;
if( i>=nPK
&& sqlite3_column_type(pStmt, i)==SQLITE_BLOB
&& sqlite3_column_type(pStmt, nCol+1+i)==SQLITE_BLOB
){
const char *aSrc = sqlite3_column_blob(pStmt, nCol+1+i);
int nSrc = sqlite3_column_bytes(pStmt, nCol+1+i);
const char *aFinal = sqlite3_column_blob(pStmt, i);
int nFinal = sqlite3_column_bytes(pStmt, i);
char *aDelta;
int nDelta;
aDelta = sqlite3_malloc(nFinal + 60);
nDelta = rbuDeltaCreate(aSrc, nSrc, aFinal, nFinal, aDelta);
if( nDelta<nFinal ){
int j;
fprintf(out, "x'");
for(j=0; j<nDelta; j++) fprintf(out, "%02x", (u8)aDelta[j]);
fprintf(out, "'");
zOtaControl[i-bOtaRowid] = 'f';
bDone = 1;
}
sqlite3_free(aDelta);
}
if( bDone==0 ){
printQuoted(out, sqlite3_column_value(pStmt, i));
}
fprintf(out, ", ");
}
fprintf(out, "'%s'", zOtaControl);
sqlite3_free(zOtaControl);
}
/* And the closing bracket of the insert statement */
fprintf(out, ");\n");
}