mirror of
https://github.com/facebook/zstd.git
synced 2025-09-01 04:42:03 +03:00
removed fastscan mode
This commit is contained in:
@@ -489,7 +489,7 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
|
|||||||
static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
||||||
const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
|
const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */
|
||||||
const size_t* fileSizes, unsigned nbFiles,
|
const size_t* fileSizes, unsigned nbFiles,
|
||||||
U32 shiftRatio, unsigned maxDictSize)
|
U32 shiftRatio)
|
||||||
{
|
{
|
||||||
int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
|
int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
|
||||||
int* const suffix = suffix0+1;
|
int* const suffix = suffix0+1;
|
||||||
@@ -542,16 +542,6 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|||||||
DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
|
DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
|
||||||
} }
|
} }
|
||||||
|
|
||||||
/* limit dictionary size */
|
|
||||||
{ U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
|
|
||||||
U32 currentSize = 0;
|
|
||||||
U32 n; for (n=1; n<max; n++) {
|
|
||||||
currentSize += dictList[n].length;
|
|
||||||
if (currentSize > maxDictSize) break;
|
|
||||||
}
|
|
||||||
dictList->pos = n;
|
|
||||||
}
|
|
||||||
|
|
||||||
_cleanup:
|
_cleanup:
|
||||||
free(suffix0);
|
free(suffix0);
|
||||||
free(reverseSuffix);
|
free(reverseSuffix);
|
||||||
@@ -845,45 +835,6 @@ _cleanup:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#define DIB_FASTSEGMENTSIZE 64
|
|
||||||
/*! ZDICT_fastSampling() (based on an idea proposed by Giuseppe Ottaviano) :
|
|
||||||
Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`,
|
|
||||||
up to `dictSize`.
|
|
||||||
Filling starts from the end of `dictBuffer` and proceeds downward for as many segments as fit.
|
|
||||||
if `dictSize` is not a multiple of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
|
|
||||||
@return : amount of data written into `dictBuffer`,
|
|
||||||
or an error code
|
|
||||||
*/
|
|
||||||
static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,
|
|
||||||
const void* samplesBuffer, size_t samplesSize)
|
|
||||||
{
|
|
||||||
char* dstPtr = (char*)dictBuffer + dictSize;
|
|
||||||
const char* srcPtr = (const char*)samplesBuffer;
|
|
||||||
size_t const nbSegments = dictSize / DIB_FASTSEGMENTSIZE;
|
|
||||||
size_t segNb, interSize;
|
|
||||||
|
|
||||||
if (nbSegments <= 2) return ERROR(srcSize_wrong);
|
|
||||||
if (samplesSize < dictSize) return ERROR(srcSize_wrong);
|
|
||||||
|
|
||||||
/* first and last segments are part of dictionary, in case they contain interesting header/footer */
|
|
||||||
dstPtr -= DIB_FASTSEGMENTSIZE;
|
|
||||||
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
|
|
||||||
dstPtr -= DIB_FASTSEGMENTSIZE;
|
|
||||||
memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
|
|
||||||
|
|
||||||
/* regularly copy a segment */
|
|
||||||
interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1);
|
|
||||||
srcPtr += DIB_FASTSEGMENTSIZE;
|
|
||||||
for (segNb=2; segNb < nbSegments; segNb++) {
|
|
||||||
srcPtr += interSize;
|
|
||||||
dstPtr -= DIB_FASTSEGMENTSIZE;
|
|
||||||
memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
|
|
||||||
srcPtr += DIB_FASTSEGMENTSIZE;
|
|
||||||
}
|
|
||||||
|
|
||||||
return nbSegments * DIB_FASTSEGMENTSIZE;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
|
size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
|
||||||
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
||||||
ZDICT_params_t params)
|
ZDICT_params_t params)
|
||||||
@@ -914,7 +865,7 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#define DIB_MINSAMPLESSIZE (DIB_FASTSEGMENTSIZE*3)
|
#define DIB_MINSAMPLESSIZE 512
|
||||||
/*! ZDICT_trainFromBuffer_unsafe() :
|
/*! ZDICT_trainFromBuffer_unsafe() :
|
||||||
* `samplesBuffer` must be followed by noisy guard band.
|
* `samplesBuffer` must be followed by noisy guard band.
|
||||||
* @return : size of dictionary.
|
* @return : size of dictionary.
|
||||||
@@ -928,53 +879,67 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|||||||
dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
|
dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
|
||||||
unsigned selectivity = params.selectivityLevel;
|
unsigned selectivity = params.selectivityLevel;
|
||||||
size_t const targetDictSize = maxDictSize;
|
size_t const targetDictSize = maxDictSize;
|
||||||
size_t sBuffSize;
|
size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
|
||||||
size_t dictSize = 0;
|
size_t dictSize = 0;
|
||||||
|
|
||||||
/* checks */
|
/* checks */
|
||||||
if (!dictList) return ERROR(memory_allocation);
|
if (!dictList) return ERROR(memory_allocation);
|
||||||
if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
|
if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
|
||||||
|
if (samplesBuffSize < DIB_MINSAMPLESSIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
|
||||||
|
|
||||||
/* init */
|
/* init */
|
||||||
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
|
|
||||||
if (sBuffSize < DIB_MINSAMPLESSIZE) { free(dictList); return 0; } /* not enough source to create dictionary */
|
|
||||||
ZDICT_initDictItem(dictList);
|
ZDICT_initDictItem(dictList);
|
||||||
g_displayLevel = params.notificationLevel;
|
g_displayLevel = params.notificationLevel;
|
||||||
if (selectivity==0) selectivity = g_selectivity_default;
|
if (selectivity==0) selectivity = g_selectivity_default;
|
||||||
|
|
||||||
/* build dictionary */
|
/* build dictionary */
|
||||||
if (selectivity>1) { /* selectivity == 1 => fast mode */
|
ZDICT_trainBuffer(dictList, dictListSize,
|
||||||
ZDICT_trainBuffer(dictList, dictListSize,
|
samplesBuffer, samplesBuffSize,
|
||||||
samplesBuffer, sBuffSize,
|
samplesSizes, nbSamples,
|
||||||
samplesSizes, nbSamples,
|
selectivity);
|
||||||
selectivity, (U32)targetDictSize);
|
|
||||||
|
/* display best matches */
|
||||||
|
if (g_displayLevel>= 3) {
|
||||||
|
U32 const nb = 25;
|
||||||
|
U32 const dictContentSize = ZDICT_dictSize(dictList);
|
||||||
|
U32 u;
|
||||||
|
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
|
||||||
|
DISPLAYLEVEL(3, "list %u best segments \n", nb);
|
||||||
|
for (u=1; u<=nb; u++) {
|
||||||
|
U32 pos = dictList[u].pos;
|
||||||
|
U32 length = dictList[u].length;
|
||||||
|
U32 printedLength = MIN(40, length);
|
||||||
|
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
|
||||||
|
u, length, pos, dictList[u].savings);
|
||||||
|
ZDICT_printHex(3, (const char*)samplesBuffer+pos, printedLength);
|
||||||
|
DISPLAYLEVEL(3, "| \n");
|
||||||
|
} }
|
||||||
|
|
||||||
/* display best matches */
|
|
||||||
if (g_displayLevel>= 3) {
|
|
||||||
U32 const nb = 25;
|
|
||||||
U32 const dictContentSize = ZDICT_dictSize(dictList);
|
|
||||||
U32 u;
|
|
||||||
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
|
|
||||||
DISPLAYLEVEL(3, "list %u best segments \n", nb);
|
|
||||||
for (u=1; u<=nb; u++) {
|
|
||||||
U32 pos = dictList[u].pos;
|
|
||||||
U32 length = dictList[u].length;
|
|
||||||
U32 printedLength = MIN(40, length);
|
|
||||||
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
|
|
||||||
u, length, pos, dictList[u].savings);
|
|
||||||
ZDICT_printHex(3, (const char*)samplesBuffer+pos, printedLength);
|
|
||||||
DISPLAYLEVEL(3, "| \n");
|
|
||||||
} } }
|
|
||||||
|
|
||||||
/* create dictionary */
|
/* create dictionary */
|
||||||
{ U32 dictContentSize = ZDICT_dictSize(dictList);
|
{ U32 dictContentSize = ZDICT_dictSize(dictList);
|
||||||
U64 const totalSamplesSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
|
|
||||||
if (dictContentSize < targetDictSize/2) {
|
if (dictContentSize < targetDictSize/2) {
|
||||||
DISPLAYLEVEL(2, "! warning : created dictionary significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
|
DISPLAYLEVEL(2, "! warning : created dictionary significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
|
||||||
DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
|
DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
|
||||||
DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
|
DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
|
||||||
if (totalSamplesSize < 10 * targetDictSize)
|
if (samplesBuffSize < 10 * targetDictSize)
|
||||||
DISPLAYLEVEL(2, "! consider also increasing the number of samples (total size : %u MB)\n", (U32)(totalSamplesSize>>20));
|
DISPLAYLEVEL(2, "! consider also increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dictContentSize > targetDictSize*2) {
|
||||||
|
DISPLAYLEVEL(2, "! warning : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
|
||||||
|
DISPLAYLEVEL(2, "! consider decreasing selectivity to produce denser dictionary (-s%u) \n", selectivity-1);
|
||||||
|
DISPLAYLEVEL(2, "! test its efficiency on samples \n");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* limit dictionary size */
|
||||||
|
{ U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */
|
||||||
|
U32 currentSize = 0;
|
||||||
|
U32 n; for (n=1; n<max; n++) {
|
||||||
|
currentSize += dictList[n].length;
|
||||||
|
if (currentSize > targetDictSize) break;
|
||||||
|
}
|
||||||
|
dictList->pos = n;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* build dict content */
|
/* build dict content */
|
||||||
@@ -987,14 +952,6 @@ size_t ZDICT_trainFromBuffer_unsafe(
|
|||||||
memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
|
memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
|
||||||
} }
|
} }
|
||||||
|
|
||||||
/* fast mode dict content */
|
|
||||||
if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
|
|
||||||
DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
|
|
||||||
DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
|
|
||||||
dictContentSize = (U32)ZDICT_fastSampling(dictBuffer, targetDictSize,
|
|
||||||
samplesBuffer, sBuffSize);
|
|
||||||
}
|
|
||||||
|
|
||||||
dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
|
dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
|
||||||
samplesBuffer, samplesSizes, nbSamples,
|
samplesBuffer, samplesSizes, nbSamples,
|
||||||
params);
|
params);
|
||||||
|
Reference in New Issue
Block a user