From f25a6e9f8f36375bce85bfefdf388c770057c27c Mon Sep 17 00:00:00 2001
From: Bimba Shrestha
Date: Fri, 10 Jan 2020 14:25:24 -0800
Subject: [PATCH] Adding new cli endpoint --patch-from= (#1940)

* Adding new cli endpoint --diff-from=

* Appveyor conversion nit

* Using bool set trick instead of direct set

* Removing --diff-from and only leaving --diff-from=#

* Throwing error when both dictFileName vars are set

* Clean up syntax

* Renaming diff-from to patch-from

* Reverting comma separated syntax clean up

* Updating playtests with patch-from

* Uncommenting accidentally commented

* Updating remaining docs and var names to be patch-from instead of diff-from

* Constifying

* Using existing log2 function and removing newly created one

* Argument order (moving prefs to end)

* Using comma separated syntax

* Moving to outside #ifndef
---
 programs/fileio.c  | 71 +++++++++++++++++++++++++++++++---------------
 programs/fileio.h  |  1 +
 programs/zstd.1.md | 11 ++++++-
 programs/zstdcli.c | 29 +++++++++++++------
 tests/playTests.sh |  8 ++++++
 5 files changed, 87 insertions(+), 33 deletions(-)

diff --git a/programs/fileio.c b/programs/fileio.c
index 8fc25d357..e08c7f350 100644
--- a/programs/fileio.c
+++ b/programs/fileio.c
@@ -77,6 +77,7 @@
 
 #define FNSPACE 30
 
+#define PATCHFROM_WINDOWSIZE_EXTRA_BYTES (1 KB)
 
 /*-*************************************
 *  Macros
@@ -321,6 +322,7 @@ struct FIO_prefs_s {
     int nbWorkers;
 
     int excludeCompressedFiles;
+    int patchFromMode;
 };
 
 
@@ -487,6 +489,10 @@ void FIO_setLdmHashRateLog(FIO_prefs_t* const prefs, int ldmHashRateLog) {
     prefs->ldmHashRateLog = ldmHashRateLog;
 }
 
+void FIO_setPatchFromMode(FIO_prefs_t* const prefs, int value)
+{
+    prefs->patchFromMode = value != 0;   /* any non-zero value is stored as 1 */
+}
 
 /*-*************************************
 *  Functions
@@ -624,7 +630,7 @@ FIO_openDstFile(FIO_prefs_t* const prefs,
  * @return : loaded size
  *  if fileName==NULL, returns 0 and a NULL pointer
  */
-static size_t FIO_createDictBuffer(void** bufferPtr, const char* fileName)
+static size_t FIO_createDictBuffer(void** bufferPtr, const char* fileName, FIO_prefs_t* const prefs)
 {
     FILE* fileHandle;
     U64 fileSize;
@@ -638,9 +644,12 @@ static size_t FIO_createDictBuffer(void** bufferPtr, const char* fileName)
     if (fileHandle==NULL) EXM_THROW(31, "%s: %s", fileName, strerror(errno));
 
     fileSize = UTIL_getFileSize(fileName);
-    if (fileSize > DICTSIZE_MAX) {
-        EXM_THROW(32, "Dictionary file %s is too large (> %u MB)",
-                        fileName, DICTSIZE_MAX >> 20);   /* avoid extreme cases */
+    {
+        size_t const dictSizeMax = prefs->patchFromMode ? prefs->memLimit : DICTSIZE_MAX;
+        if (fileSize > dictSizeMax) {
+            EXM_THROW(32, "Dictionary file %s is too large (> %u bytes)",
+                            fileName, (unsigned)dictSizeMax);   /* avoid extreme cases */
+        }
     }
     *bufferPtr = malloc((size_t)fileSize);
     if (*bufferPtr==NULL) EXM_THROW(34, "%s", strerror(errno));
@@ -743,6 +752,20 @@ FIO_createFilename_fromOutDir(const char* path, const char* outDirName, const si
     return result;
 }
 
+/* FIO_highbit64() :
+ * gives position of highest bit.
+ * note : only works for v > 0 !
+ */
+static unsigned FIO_highbit64(unsigned long long v)
+{
+    unsigned count = 0;
+    assert(v != 0);
+    v >>= 1;
+    while (v) { v >>= 1; count++; }
+    return count;
+}
+
+
 #ifndef ZSTD_NOCOMPRESS
 
 /* **********************************************************************
@@ -760,8 +783,8 @@ typedef struct {
 } cRess_t;
 
 static cRess_t FIO_createCResources(FIO_prefs_t* const prefs,
-                                    const char* dictFileName, int cLevel,
-                                    ZSTD_compressionParameters comprParams) {
+                                    const char* dictFileName, const size_t maxSrcFileSize,
+                                    int cLevel, ZSTD_compressionParameters comprParams) {
     cRess_t ress;
     memset(&ress, 0, sizeof(ress));
 
@@ -779,7 +802,7 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs,
 
     /* Advanced parameters, including dictionary */
     {   void* dictBuffer;
-        size_t const dictBuffSize = FIO_createDictBuffer(&dictBuffer, dictFileName);   /* works with dictFileName==NULL */
+        size_t const dictBuffSize = FIO_createDictBuffer(&dictBuffer, dictFileName, prefs);   /* works with dictFileName==NULL */
         if (dictFileName && (dictBuffer==NULL))
             EXM_THROW(32, "allocation error : can't create dictBuffer");
         ress.dictFileName = dictFileName;
@@ -787,6 +810,10 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs,
         if (prefs->adaptiveMode && !prefs->ldmFlag && !comprParams.windowLog)
             comprParams.windowLog = ADAPT_WINDOWLOG_DEFAULT;
 
+        if (prefs->patchFromMode) {
+            comprParams.windowLog = FIO_highbit64((unsigned long long)maxSrcFileSize + PATCHFROM_WINDOWSIZE_EXTRA_BYTES) + 1;   /* highbit is floor(log2) : +1 so that (1<<windowLog) > srcSize */
+        }
+
         CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_contentSizeFlag, 1) );  /* always enable content size when available (note: supposed to be default) */
         CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_dictIDFlag, prefs->dictIDFlag) );
         CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_checksumFlag, prefs->checksumFlag) );
@@ -1515,7 +1542,7 @@ int FIO_compressFilename(FIO_prefs_t* const prefs, const char* dstFileName,
                          const char* srcFileName, const char* dictFileName,
                          int compressionLevel, ZSTD_compressionParameters comprParams)
 {
-    cRess_t const ress = FIO_createCResources(prefs, dictFileName, compressionLevel, comprParams);
+    cRess_t const ress = FIO_createCResources(prefs, dictFileName, (size_t)UTIL_getFileSize(srcFileName), compressionLevel, comprParams);
     int const result = FIO_compressFilename_srcFile(prefs, ress, dstFileName, srcFileName, compressionLevel);
 
 
@@ -1563,6 +1590,15 @@ FIO_determineCompressedName(const char* srcFileName, const char* outDirName, con
     return dstFileNameBuffer;
 }
 
+static size_t FIO_getLargestFileSize(const char** inFileNames, unsigned nbFiles)
+{
+    size_t i, fileSize, maxFileSize = 0;
+    for (i = 0; i < nbFiles; i++) {
+        fileSize = (size_t)UTIL_getFileSize(inFileNames[i]);
+        maxFileSize = fileSize > maxFileSize ? fileSize : maxFileSize;
+    }
+    return maxFileSize;
+}
 
 /* FIO_compressMultipleFilenames() :
  * compress nbFiles files
@@ -1578,7 +1614,9 @@ int FIO_compressMultipleFilenames(FIO_prefs_t* const prefs,
                                   ZSTD_compressionParameters comprParams)
 {
     int error = 0;
-    cRess_t ress = FIO_createCResources(prefs, dictFileName, compressionLevel, comprParams);
+    cRess_t ress = FIO_createCResources(prefs, dictFileName,
+        FIO_getLargestFileSize(inFileNamesTable, nbFiles),
+        compressionLevel, comprParams);
 
     /* init */
     assert(outFileName != NULL || suffix != NULL);
@@ -1648,7 +1686,7 @@ static dRess_t FIO_createDResources(FIO_prefs_t* const prefs, const char* dictFi
 
     /* dictionary */
     {   void* dictBuffer;
-        size_t const dictBufferSize = FIO_createDictBuffer(&dictBuffer, dictFileName);
+        size_t const dictBufferSize = FIO_createDictBuffer(&dictBuffer, dictFileName, prefs);
         CHECK( ZSTD_initDStream_usingDict(ress.dctx, dictBuffer, dictBufferSize) );
         free(dictBuffer);
     }
@@ -1793,19 +1831,6 @@ static int FIO_passThrough(const FIO_prefs_t* const prefs,
     return 0;
 }
 
-/* FIO_highbit64() :
- * gives position of highest bit.
- * note : only works for v > 0 !
- */
-static unsigned FIO_highbit64(unsigned long long v)
-{
-    unsigned count = 0;
-    assert(v != 0);
-    v >>= 1;
-    while (v) { v >>= 1; count++; }
-    return count;
-}
-
 /* FIO_zstdErrorHelp() :
  * detailed error message when requested window size is too large */
 static void
diff --git a/programs/fileio.h b/programs/fileio.h
index a7da089f6..c592542e5 100644
--- a/programs/fileio.h
+++ b/programs/fileio.h
@@ -94,6 +94,7 @@ void FIO_setLiteralCompressionMode(
 void FIO_setNoProgress(unsigned noProgress);
 void FIO_setNotificationLevel(int level);
 void FIO_setExcludeCompressedFile(FIO_prefs_t* const prefs, int excludeCompressedFiles);
+void FIO_setPatchFromMode(FIO_prefs_t* const prefs, int value);
 
 /*-*************************************
 *  Single File functions
diff --git a/programs/zstd.1.md b/programs/zstd.1.md
index 22a76c239..8539deb9e 100644
--- a/programs/zstd.1.md
+++ b/programs/zstd.1.md
@@ -122,11 +122,20 @@ the last one takes effect.
 
     Note: If `windowLog` is set to larger than 27, `--long=windowLog` or
     `--memory=windowSize` needs to be passed to the decompressor.
+* `--patch-from=FILE`:
+    Specify the file to be used as a reference point for zstd's diff engine.
+    This is effectively dictionary compression with some convenient parameter
+    selection, namely that windowSize > srcSize.
+
+    Note: cannot use both this and -D together.
 * `-M#`, `--memory=#`:
-    Set a memory usage limit for decompression. By default, Zstandard uses 128 MB
+    Set a memory usage limit. By default, Zstandard uses 128 MB for decompression
     as the maximum amount of memory the decompressor is allowed to use, but you can
     override this manually if need be in either direction (ie. you can increase or
     decrease it).
+
+    This is also used during compression when used with --patch-from=. In this case,
+    this parameter overrides the maximum size allowed for a dictionary (128 MB).
 * `-T#`, `--threads=#`:
     Compress using `#` working threads (default: 1).
     If `#` is 0, attempt to detect and use the number of physical CPU cores.
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index b7de3d8ff..a97554219 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -597,6 +597,7 @@ int main(int const argCount, const char* argv[])
     const char* outFileName = NULL;
     const char* outDirName = NULL;
     const char* dictFileName = NULL;
+    const char* patchFromDictFileName = NULL;
     const char* suffix = ZSTD_EXTENSION;
     unsigned maxDictSize = g_defaultMaxDictSize;
     unsigned dictID = 0;
@@ -618,7 +619,7 @@ int main(int const argCount, const char* argv[])
 
     /* init */
     (void)recursive; (void)cLevelLast;    /* not used when ZSTD_NOBENCH set */
-    (void)memLimit;   /* not used when ZSTD_NODECOMPRESS set */
+    (void)memLimit;
     assert(argCount >= 1);
     if ((filenames==NULL) || (file_of_names==NULL)) { DISPLAY("zstd: allocation error \n"); exit(1); }
     programName = lastNameFromPath(programName);
@@ -758,6 +759,7 @@ int main(int const argCount, const char* argv[])
                     if (longCommandWArg(&argument, "--target-compressed-block-size=")) { targetCBlockSize = readU32FromChar(&argument); continue; }
                     if (longCommandWArg(&argument, "--size-hint=")) { srcSizeHint = readU32FromChar(&argument); continue; }
                     if (longCommandWArg(&argument, "--output-dir-flat=")) { outDirName = argument; continue; }
+                    if (longCommandWArg(&argument, "--patch-from=")) { patchFromDictFileName = argument; continue; }
                     if (longCommandWArg(&argument, "--long")) {
                         unsigned ldmWindowLog = 0;
                         ldmFlag = 1;
@@ -868,7 +870,7 @@ int main(int const argCount, const char* argv[])
                         /* destination file name */
                     case 'o': nextArgumentIsOutFileName=1; lastCommand=1; argument++; break;
 
-                        /* limit decompression memory */
+                        /* limit memory */
                     case 'M':
                         argument++;
                         memLimit = readU32FromChar(&argument);
@@ -1167,12 +1169,28 @@ int main(int const argCount, const char* argv[])
     }   }
 #endif
 
+    if (dictFileName != NULL && patchFromDictFileName != NULL) {
+        DISPLAY("error : can't use -D and --patch-from=# at the same time \n");
+        CLEAN_RETURN(1);
+    }
+
     /* No status message in pipe mode (stdin - stdout) or multi-files mode */
     if (!strcmp(filenames->fileNames[0], stdinmark) && outFileName && !strcmp(outFileName,stdoutmark) && (g_displayLevel==2)) g_displayLevel=1;
     if ((filenames->tableSize > 1) & (g_displayLevel==2)) g_displayLevel=1;
 
     /* IO Stream/File */
     FIO_setNotificationLevel(g_displayLevel);
+    FIO_setPatchFromMode(prefs, patchFromDictFileName != NULL);
+    if (patchFromDictFileName != NULL) {
+        dictFileName = patchFromDictFileName;
+    }
+    if (memLimit == 0) {
+        if (compressionParams.windowLog == 0) {
+            memLimit = (U32)1 << g_defaultMaxWindowLog;
+        } else {
+            memLimit = (U32)1 << (compressionParams.windowLog & 31);
+    }   }
+    FIO_setMemLimit(prefs, memLimit);
     if (operation==zom_compress) {
 #ifndef ZSTD_NOCOMPRESS
         FIO_setNbWorkers(prefs, nbWorkers);
@@ -1204,13 +1222,6 @@ int main(int const argCount, const char* argv[])
 #endif
     } else {  /* decompression or test */
 #ifndef ZSTD_NODECOMPRESS
-        if (memLimit == 0) {
-            if (compressionParams.windowLog == 0) {
-                memLimit = (U32)1 << g_defaultMaxWindowLog;
-            } else {
-                memLimit = (U32)1 << (compressionParams.windowLog & 31);
-        }   }
-        FIO_setMemLimit(prefs, memLimit);
         if (filenames->tableSize == 1 && outFileName) {
             operationResult = FIO_decompressFilename(prefs, outFileName, filenames->fileNames[0], dictFileName);
         } else {
diff --git a/tests/playTests.sh b/tests/playTests.sh
index d0943e1bb..e0ca2da49 100755
--- a/tests/playTests.sh
+++ b/tests/playTests.sh
@@ -1202,6 +1202,14 @@ then
     $ZSTD -f -vv --rsyncable --single-thread tmp && die "--rsyncable must fail with --single-thread"
 fi
 
+println "\n===> patch-from tests"
+
+./datagen -g1000 -P50 > tmp_dict
+./datagen -g1000 -P10 > tmp_patch
+$ZSTD --memory=10000 --patch-from=tmp_dict tmp_patch -o tmp_patch_diff
+$ZSTD -d --memory=10000 --patch-from=tmp_dict tmp_patch_diff -o tmp_patch_recon
+$DIFF -s tmp_patch_recon tmp_patch
+rm -rf tmp_*
 
 println "\n===> large files tests "
 