mirror of
https://github.com/facebook/zstd.git
synced 2025-08-08 17:22:10 +03:00
Added : ability to manually select the dictionary ID of a newly created dictionary
This commit is contained in:
@@ -101,27 +101,30 @@ const char* DiB_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCo
|
||||
/* ********************************************************
|
||||
* File related operations
|
||||
**********************************************************/
|
||||
static void DiB_loadFiles(void* buffer, size_t bufferSize,
|
||||
size_t* fileSizes,
|
||||
const char** fileNamesTable, unsigned nbFiles)
|
||||
/** DiB_loadFiles() :
|
||||
* @return : nb of files actually loaded into `buffer` */
|
||||
static unsigned DiB_loadFiles(void* buffer, size_t bufferSize,
|
||||
size_t* fileSizes,
|
||||
const char** fileNamesTable, unsigned nbFiles)
|
||||
{
|
||||
char* buff = (char*)buffer;
|
||||
char* const buff = (char*)buffer;
|
||||
size_t pos = 0;
|
||||
unsigned n;
|
||||
|
||||
for (n=0; n<nbFiles; n++) {
|
||||
size_t readSize;
|
||||
unsigned long long fileSize = UTIL_getFileSize(fileNamesTable[n]);
|
||||
FILE* f = fopen(fileNamesTable[n], "rb");
|
||||
unsigned long long const fs64 = UTIL_getFileSize(fileNamesTable[n]);
|
||||
size_t const fileSize = (size_t)(fs64 > bufferSize-pos ? 0 : fs64);
|
||||
FILE* const f = fopen(fileNamesTable[n], "rb");
|
||||
if (f==NULL) EXM_THROW(10, "impossible to open file %s", fileNamesTable[n]);
|
||||
DISPLAYUPDATE(2, "Loading %s... \r", fileNamesTable[n]);
|
||||
if (fileSize > bufferSize-pos) fileSize = 0; /* stop there, not enough memory to load all files */
|
||||
readSize = fread(buff+pos, 1, (size_t)fileSize, f);
|
||||
if (readSize != (size_t)fileSize) EXM_THROW(11, "could not read %s", fileNamesTable[n]);
|
||||
pos += readSize;
|
||||
fileSizes[n] = (size_t)fileSize;
|
||||
{ size_t const readSize = fread(buff+pos, 1, fileSize, f);
|
||||
if (readSize != fileSize) EXM_THROW(11, "could not read %s", fileNamesTable[n]);
|
||||
pos += readSize; }
|
||||
fileSizes[n] = fileSize;
|
||||
fclose(f);
|
||||
if (fileSize == 0) break; /* stop there, not enough memory to load all files */
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
|
||||
@@ -130,7 +133,7 @@ static void DiB_loadFiles(void* buffer, size_t bufferSize,
|
||||
**********************************************************/
|
||||
static size_t DiB_findMaxMem(unsigned long long requiredMem)
|
||||
{
|
||||
size_t step = 8 MB;
|
||||
size_t const step = 8 MB;
|
||||
void* testmem = NULL;
|
||||
|
||||
requiredMem = (((requiredMem >> 23) + 1) << 23);
|
||||
@@ -162,7 +165,7 @@ static void DiB_fillNoise(void* buffer, size_t length)
|
||||
static void DiB_saveDict(const char* dictFileName,
|
||||
const void* buff, size_t buffSize)
|
||||
{
|
||||
FILE* f = fopen(dictFileName, "wb");
|
||||
FILE* const f = fopen(dictFileName, "wb");
|
||||
if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
|
||||
|
||||
{ size_t const n = fwrite(buff, 1, buffSize, f);
|
||||
@@ -185,47 +188,44 @@ size_t ZDICT_trainFromBuffer_unsafe(void* dictBuffer, size_t dictBufferCapacity,
|
||||
ZDICT_params_t parameters);
|
||||
|
||||
|
||||
#define MIN(a,b) ((a)<(b)?(a):(b))
|
||||
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
||||
const char** fileNamesTable, unsigned nbFiles,
|
||||
ZDICT_params_t params)
|
||||
{
|
||||
void* srcBuffer;
|
||||
size_t benchedSize;
|
||||
size_t* fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
|
||||
unsigned long long totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles);
|
||||
void* dictBuffer = malloc(maxDictSize);
|
||||
size_t dictSize;
|
||||
void* const dictBuffer = malloc(maxDictSize);
|
||||
size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
|
||||
unsigned long long const totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles);
|
||||
size_t const maxMem = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
|
||||
size_t const benchedSize = MIN (maxMem, (size_t)totalSizeToLoad);
|
||||
void* const srcBuffer = malloc(benchedSize+NOISELENGTH);
|
||||
int result = 0;
|
||||
|
||||
/* Checks */
|
||||
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
|
||||
|
||||
/* init */
|
||||
g_displayLevel = params.notificationLevel;
|
||||
benchedSize = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
|
||||
if ((unsigned long long)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
|
||||
if (benchedSize < totalSizeToLoad)
|
||||
DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
|
||||
|
||||
/* Memory allocation & restrictions */
|
||||
srcBuffer = malloc(benchedSize+NOISELENGTH); /* + noise */
|
||||
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
|
||||
|
||||
/* Load input buffer */
|
||||
DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
|
||||
nbFiles = DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
|
||||
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
||||
|
||||
/* call buffer version */
|
||||
dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
|
||||
srcBuffer, fileSizes, nbFiles,
|
||||
params);
|
||||
if (ZDICT_isError(dictSize)) {
|
||||
DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */
|
||||
result = 1;
|
||||
goto _cleanup;
|
||||
{ size_t const dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
|
||||
srcBuffer, fileSizes, nbFiles,
|
||||
params);
|
||||
if (ZDICT_isError(dictSize)) {
|
||||
DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */
|
||||
result = 1;
|
||||
goto _cleanup;
|
||||
}
|
||||
/* save dict */
|
||||
DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
|
||||
DiB_saveDict(dictFileName, dictBuffer, dictSize);
|
||||
}
|
||||
|
||||
/* save dict */
|
||||
DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
|
||||
DiB_saveDict(dictFileName, dictBuffer, dictSize);
|
||||
|
||||
/* clean up */
|
||||
_cleanup:
|
||||
free(srcBuffer);
|
||||
|
@@ -129,6 +129,8 @@ $ZSTD -d tmp -D tmpDict -of result
|
||||
diff zstdcli.c result
|
||||
$ZSTD --train *.c *.h -o tmpDictC
|
||||
$ZSTD -d tmp -D tmpDictC -of result && die "wrong dictionary not detected!"
|
||||
$ZSTD --train *.c --dictID 1 -o tmpDict1
|
||||
cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
|
||||
|
||||
|
||||
$ECHO "\n**** multiple files tests **** "
|
||||
|
@@ -18,11 +18,11 @@
|
||||
.PP
|
||||
.B unzstd
|
||||
is equivalent to
|
||||
.BR "zstd \-d"
|
||||
.BR "zstd \-d"
|
||||
.br
|
||||
.B zstdcat
|
||||
is equivalent to
|
||||
.BR "zstd \-dc"
|
||||
.BR "zstd \-dc"
|
||||
.br
|
||||
|
||||
.SH DESCRIPTION
|
||||
@@ -90,7 +90,15 @@ Typical gains range from ~10% (at 64KB) to x5 better (at <1KB).
|
||||
dictionary saved into `file` (default: dictionary)
|
||||
.TP
|
||||
.B \--maxdict #
|
||||
limit dictionary to specified size (default : 112640)
|
||||
limit dictionary to specified size (default : 112640)
|
||||
.TP
|
||||
.B \--dictID #
|
||||
A dictionary ID is a locally unique ID that a decoder can use to verify it is using the right dictionary.
|
||||
By default, zstd will create a random 4-byte ID.
|
||||
It is possible to specify an explicit number instead.
|
||||
Short numbers have an advantage: an ID < 256 needs only 1 byte in the compressed frame header,
|
||||
and an ID < 65536 needs only 2 bytes. This compares favorably to the 4-byte default.
|
||||
However, it is up to the dictionary manager to avoid assigning the same ID to two different dictionaries.
|
||||
.TP
|
||||
.B \-s#
|
||||
dictionary selectivity level (default: 9)
|
||||
|
@@ -143,6 +143,7 @@ static int usage_advanced(const char* programName)
|
||||
DISPLAY( " -o file: `file` is dictionary name (default: %s) \n", g_defaultDictName);
|
||||
DISPLAY( "--maxdict:limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
|
||||
DISPLAY( " -s# : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel);
|
||||
DISPLAY( "--dictID: force dictionary ID to specified value (default: random)\n");
|
||||
#endif
|
||||
#ifndef ZSTD_NOBENCH
|
||||
DISPLAY( "\n");
|
||||
@@ -185,7 +186,8 @@ int main(int argCount, const char** argv)
|
||||
operationResult=0,
|
||||
dictBuild=0,
|
||||
nextArgumentIsOutFileName=0,
|
||||
nextArgumentIsMaxDict=0;
|
||||
nextArgumentIsMaxDict=0,
|
||||
nextArgumentIsDictID=0;
|
||||
unsigned cLevel = 1;
|
||||
unsigned cLevelLast = 1;
|
||||
unsigned recursive = 0;
|
||||
@@ -196,6 +198,7 @@ int main(int argCount, const char** argv)
|
||||
const char* dictFileName = NULL;
|
||||
char* dynNameSpace = NULL;
|
||||
unsigned maxDictSize = g_defaultMaxDictSize;
|
||||
unsigned dictID = 0;
|
||||
unsigned dictCLevel = g_defaultDictCLevel;
|
||||
unsigned dictSelect = g_defaultSelectivityLevel;
|
||||
#ifdef UTIL_HAS_CREATEFILELIST
|
||||
@@ -236,6 +239,7 @@ int main(int argCount, const char** argv)
|
||||
if (!strcmp(argument, "--test")) { decode=1; outFileName=nulmark; FIO_overwriteMode(); continue; }
|
||||
if (!strcmp(argument, "--train")) { dictBuild=1; outFileName=g_defaultDictName; continue; }
|
||||
if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; }
|
||||
if (!strcmp(argument, "--dictID")) { nextArgumentIsDictID=1; continue; }
|
||||
if (!strcmp(argument, "--keep")) { continue; } /* does nothing, since preserving input is default; for gzip/xz compatibility */
|
||||
if (!strcmp(argument, "--ultra")) { FIO_setMaxWLog(0); continue; }
|
||||
if (!strcmp(argument, "--sparse")) { FIO_setSparseWrite(2); continue; }
|
||||
@@ -393,6 +397,14 @@ int main(int argCount, const char** argv)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (nextArgumentIsDictID) {
|
||||
nextArgumentIsDictID = 0;
|
||||
dictID = 0;
|
||||
while ((*argument>='0') && (*argument<='9'))
|
||||
dictID = dictID * 10 + (*argument - '0'), argument++;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* add filename to list */
|
||||
filenameTable[filenameIdx++] = argument;
|
||||
}
|
||||
@@ -429,6 +441,7 @@ int main(int argCount, const char** argv)
|
||||
dictParams.compressionLevel = dictCLevel;
|
||||
dictParams.selectivityLevel = dictSelect;
|
||||
dictParams.notificationLevel = displayLevel;
|
||||
dictParams.dictID = dictID;
|
||||
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, dictParams);
|
||||
#endif
|
||||
goto _end;
|
||||
|
Reference in New Issue
Block a user