mirror of
https://github.com/facebook/zstd.git
synced 2025-07-29 11:21:22 +03:00
Merge fastCover into DictBuilder (#1274)
* Minor fix * Run non-optimize FASTCOVER 5 times in benchmark * Merge fastCover into dictBuilder * Fix mixed declaration issue * Add fastcover to symbol.c * Add fastCover.c and cover.h to build * Change fastCover.c to fastcover.c * Update benchmark to run FASTCOVER in dictBuilder * Undo spliting fastcover_param into cover_param and f * Remove convert param functions * Assign f to parameter * Add zdict.h to Makefile in lib * Add cover.h to BUCK * Cast 1 to U64 before shifting * Remove trimming of zero freq head and tail in selectSegment and rebenchmark * Remove f as a separate parameter of tryParam * Read 8 bytes when d is 6 * Add trimming off zero frequency head and tail * Use best functions from COVER and remove trimming part(which leads to worse compression ratio after previous bugs were fixed) * Add finalize= argument to FASTCOVER to specify percentage of training samples passed to ZDICT_finalizeDictionary * Change nbDmer to always read 8 bytes even when d=6 * Add skip=# argument to allow skipping dmers in computeFrequency in FASTCOVER * Update comments and benchmarking result * Change default method of ZDICT_trainFromBuffer to ZDICT_optimizeTrainFromBuffer_fastCover * Add dictType enum and fix bug about passing zParam when converting to coverParam * Combine finalize and skip into a single parameter * Update acceleration parameters and benchmark on 3 sample sets * Change default splitPoint of FASTCOVER to 0.75 and benchmark first 3 sample sets * Initialize variables outside of for loop in benchmark.c * Update benchmark result for hg-manifest * Remove cover.h from install-includes * Add explanation of f * Set default compression level for trainFromBuffer to 3 * Add assertion of fastCoverParams in DiB_trainFromFiles * Add checkTotalCompressedSize function + some minor fixes * Add test for multithreading fastCovr * Initialize segmentFreqs in every FASTCOVER_selectSegment and move mutex_unnlock to end of COVER_best_finish * Free segmentFreqs * Initialize segmentFreqs before calling FASTCOVER_buildDictionary instead of in FASTCOVER_selectSegment * Add FASTCOVER_MEMMULT * Minor fix * Update benchmarking result
This commit is contained in:
committed by
Nick Terrell
parent
1af27a7ed7
commit
9d6ed9def3
@ -84,7 +84,10 @@ static U32 g_ldmMinMatch = 0;
|
||||
static U32 g_ldmHashEveryLog = LDM_PARAM_DEFAULT;
|
||||
static U32 g_ldmBucketSizeLog = LDM_PARAM_DEFAULT;
|
||||
|
||||
#define DEFAULT_SPLITPOINT 1.0
|
||||
|
||||
#define DEFAULT_ACCEL 1
|
||||
|
||||
typedef enum { cover, fastCover, legacy } dictType;
|
||||
|
||||
/*-************************************
|
||||
* Display Macros
|
||||
@ -172,6 +175,7 @@ static int usage_advanced(const char* programName)
|
||||
DISPLAY( "Dictionary builder : \n");
|
||||
DISPLAY( "--train ## : create a dictionary from a training set of files \n");
|
||||
DISPLAY( "--train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args\n");
|
||||
DISPLAY( "--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#] : use the fast cover algorithm with optional args\n");
|
||||
DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel);
|
||||
DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
|
||||
DISPLAY( "--maxdict=# : limit dictionary to specified size (default: %u) \n", g_defaultMaxDictSize);
|
||||
@ -295,6 +299,33 @@ static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* parseFastCoverParameters() :
|
||||
* reads fastcover parameters from *stringPtr (e.g. "--train-fastcover=k=48,d=8,f=20,steps=32,accel=2") into *params
|
||||
* @return 1 means that fastcover parameters were correct
|
||||
* @return 0 in case of malformed parameters
|
||||
*/
|
||||
static unsigned parseFastCoverParameters(const char* stringPtr, ZDICT_fastCover_params_t* params)
|
||||
{
|
||||
memset(params, 0, sizeof(*params));
|
||||
for (; ;) {
|
||||
if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
|
||||
if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
|
||||
if (longCommandWArg(&stringPtr, "f=")) { params->f = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
|
||||
if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
|
||||
if (longCommandWArg(&stringPtr, "accel=")) { params->accel = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
|
||||
if (longCommandWArg(&stringPtr, "split=")) {
|
||||
unsigned splitPercentage = readU32FromChar(&stringPtr);
|
||||
params->splitPoint = (double)splitPercentage / 100.0;
|
||||
if (stringPtr[0]==',') { stringPtr++; continue; } else break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
if (stringPtr[0] != 0) return 0;
|
||||
DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint * 100), params->accel);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* parseLegacyParameters() :
|
||||
* reads legacy dictioanry builter parameters from *stringPtr (e.g. "--train-legacy=selectivity=8") into *selectivity
|
||||
@ -316,7 +347,19 @@ static ZDICT_cover_params_t defaultCoverParams(void)
|
||||
memset(¶ms, 0, sizeof(params));
|
||||
params.d = 8;
|
||||
params.steps = 4;
|
||||
params.splitPoint = DEFAULT_SPLITPOINT;
|
||||
params.splitPoint = 1.0;
|
||||
return params;
|
||||
}
|
||||
|
||||
static ZDICT_fastCover_params_t defaultFastCoverParams(void)
|
||||
{
|
||||
ZDICT_fastCover_params_t params;
|
||||
memset(¶ms, 0, sizeof(params));
|
||||
params.d = 8;
|
||||
params.f = 18;
|
||||
params.steps = 4;
|
||||
params.splitPoint = 0.75; /* different from default splitPoint of cover */
|
||||
params.accel = DEFAULT_ACCEL;
|
||||
return params;
|
||||
}
|
||||
#endif
|
||||
@ -431,7 +474,8 @@ int main(int argCount, const char* argv[])
|
||||
#endif
|
||||
#ifndef ZSTD_NODICT
|
||||
ZDICT_cover_params_t coverParams = defaultCoverParams();
|
||||
int cover = 1;
|
||||
ZDICT_fastCover_params_t fastCoverParams = defaultFastCoverParams();
|
||||
dictType dict = fastCover;
|
||||
#endif
|
||||
#ifndef ZSTD_NOBENCH
|
||||
BMK_advancedParams_t benchParams = BMK_initAdvancedParams();
|
||||
@ -530,18 +574,29 @@ int main(int argCount, const char* argv[])
|
||||
operation = zom_train;
|
||||
if (outFileName == NULL)
|
||||
outFileName = g_defaultDictName;
|
||||
cover = 1;
|
||||
dict = cover;
|
||||
/* Allow optional arguments following an = */
|
||||
if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); }
|
||||
else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
|
||||
else if (!parseCoverParameters(argument, &coverParams)) { CLEAN_RETURN(badusage(programName)); }
|
||||
continue;
|
||||
}
|
||||
if (longCommandWArg(&argument, "--train-fastcover")) {
|
||||
operation = zom_train;
|
||||
if (outFileName == NULL)
|
||||
outFileName = g_defaultDictName;
|
||||
dict = fastCover;
|
||||
/* Allow optional arguments following an = */
|
||||
if (*argument == 0) { memset(&fastCoverParams, 0, sizeof(fastCoverParams)); }
|
||||
else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
|
||||
else if (!parseFastCoverParameters(argument, &fastCoverParams)) { CLEAN_RETURN(badusage(programName)); }
|
||||
continue;
|
||||
}
|
||||
if (longCommandWArg(&argument, "--train-legacy")) {
|
||||
operation = zom_train;
|
||||
if (outFileName == NULL)
|
||||
outFileName = g_defaultDictName;
|
||||
cover = 0;
|
||||
dict = legacy;
|
||||
/* Allow optional arguments following an = */
|
||||
if (*argument == 0) { continue; }
|
||||
else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
|
||||
@ -881,17 +936,22 @@ int main(int argCount, const char* argv[])
|
||||
zParams.compressionLevel = dictCLevel;
|
||||
zParams.notificationLevel = g_displayLevel;
|
||||
zParams.dictID = dictID;
|
||||
if (cover) {
|
||||
if (dict == cover) {
|
||||
int const optimize = !coverParams.k || !coverParams.d;
|
||||
coverParams.nbThreads = nbWorkers;
|
||||
coverParams.zParams = zParams;
|
||||
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, &coverParams, optimize);
|
||||
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, &coverParams, NULL, optimize);
|
||||
} else if (dict == fastCover) {
|
||||
int const optimize = !fastCoverParams.k || !fastCoverParams.d;
|
||||
fastCoverParams.nbThreads = nbWorkers;
|
||||
fastCoverParams.zParams = zParams;
|
||||
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, NULL, &fastCoverParams, optimize);
|
||||
} else {
|
||||
ZDICT_legacy_params_t dictParams;
|
||||
memset(&dictParams, 0, sizeof(dictParams));
|
||||
dictParams.selectivityLevel = dictSelect;
|
||||
dictParams.zParams = zParams;
|
||||
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, &dictParams, NULL, 0);
|
||||
operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, &dictParams, NULL, NULL, 0);
|
||||
}
|
||||
#endif
|
||||
goto _end;
|
||||
|
Reference in New Issue
Block a user