mirror of
https://github.com/facebook/zstd.git
synced 2025-07-29 11:21:22 +03:00
runtime weight distribution table
and made small words a bit more common.
This commit is contained in:
@ -38,7 +38,7 @@
|
||||
#define WORD_MAX_SIZE 20
|
||||
|
||||
/* Define the word pool */
|
||||
static const char* words[] = {
|
||||
static const char* kWords[] = {
|
||||
"lorem", "ipsum", "dolor", "sit", "amet",
|
||||
"consectetur", "adipiscing", "elit", "sed", "do",
|
||||
"eiusmod", "tempor", "incididunt", "ut", "labore",
|
||||
@ -75,29 +75,56 @@ static const char* words[] = {
|
||||
"quam", "nihil", "molestiae", "illum", "fugiat",
|
||||
"quo", "pariatur"
|
||||
};
|
||||
static const unsigned kNbWords = sizeof(kWords) / sizeof(kWords[0]);
|
||||
|
||||
/* simple 1-dimension distribution that favors small words :
|
||||
* 1 letter : weight 3
|
||||
* 2-3 letters : weight 2
|
||||
* 4+ letters : weight 1
|
||||
/* simple 1-dimension distribution, based on word's length, favors small words
|
||||
*/
|
||||
static const int distrib[] = {
|
||||
0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 11,
|
||||
12, 13, 13, 14, 15, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23,
|
||||
24, 25, 26, 26, 26, 27, 28, 29, 30, 31, 32, 33, 34, 34, 35,
|
||||
36, 37, 38, 39, 40, 41, 41, 42, 43, 43, 44, 45, 45, 46, 47,
|
||||
48, 49, 50, 51, 52, 53, 54, 55, 55, 56, 57, 58, 58, 59, 60,
|
||||
60, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 72, 72,
|
||||
73, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 84, 85,
|
||||
86, 87, 88, 89, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
|
||||
100, 101, 101, 102, 103, 104, 105, 106, 106, 107, 108, 109, 110, 111, 112,
|
||||
113, 114, 115, 116, 117, 118, 119, 129, 121, 122, 123, 124, 125, 126, 127,
|
||||
128, 128, 129, 129, 130, 131, 132, 133, 134, 135, 136, 136, 137, 138, 139,
|
||||
140, 141, 142, 143, 144, 145, 146, 146, 147, 148, 149, 150, 151, 152, 153,
|
||||
154, 155, 156, 156, 157, 157, 158, 159, 160, 161, 161, 162, 163, 164, 165,
|
||||
166, 167, 168, 169, 170, 170, 171,
|
||||
};
|
||||
static const unsigned distribCount = sizeof(distrib) / sizeof(distrib[0]);
|
||||
static const int kWeights[] = { 0, 8, 6, 4, 3, 2 };
|
||||
static const unsigned kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]);
|
||||
|
||||
#define DISTRIB_SIZE_MAX 500
|
||||
static int g_distrib[DISTRIB_SIZE_MAX] = { 0 };
|
||||
static unsigned g_distribCount = 0;
|
||||
|
||||
static void countFreqs(
|
||||
const char* words[],
|
||||
size_t nbWords,
|
||||
const int* weights,
|
||||
unsigned long nbWeights)
|
||||
{
|
||||
unsigned total = 0;
|
||||
size_t w;
|
||||
for (w = 0; w < nbWords; w++) {
|
||||
unsigned long len = strlen(words[w]);
|
||||
int lmax;
|
||||
if (len >= nbWeights)
|
||||
len = nbWeights - 1;
|
||||
lmax = weights[len];
|
||||
total += (unsigned)lmax;
|
||||
}
|
||||
g_distribCount = total;
|
||||
assert(g_distribCount <= DISTRIB_SIZE_MAX);
|
||||
}
|
||||
|
||||
static void init_word_distrib(
|
||||
const char* words[],
|
||||
size_t nbWords,
|
||||
const int* weights,
|
||||
unsigned long nbWeights)
|
||||
{
|
||||
size_t w, d = 0;
|
||||
countFreqs(words, nbWords, weights, nbWeights);
|
||||
for (w = 0; w < nbWords; w++) {
|
||||
unsigned long len = strlen(words[w]);
|
||||
int l, lmax;
|
||||
if (len >= nbWeights)
|
||||
len = nbWeights - 1;
|
||||
lmax = weights[len];
|
||||
for (l = 0; l < lmax; l++) {
|
||||
g_distrib[d++] = (int)w;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Note: this unit only works when invoked sequentially.
|
||||
* No concurrent access is allowed */
|
||||
@ -166,8 +193,8 @@ static void generateSentence(int nbWords)
|
||||
const char* endSep = qmark ? "? " : ". ";
|
||||
int i;
|
||||
for (i = 0; i < nbWords; i++) {
|
||||
int const wordID = distrib[LOREM_rand(distribCount)];
|
||||
const char* const word = words[wordID];
|
||||
int const wordID = g_distrib[LOREM_rand(g_distribCount)];
|
||||
const char* const word = kWords[wordID];
|
||||
const char* sep = " ";
|
||||
if (i == commaPos)
|
||||
sep = ", ";
|
||||
@ -200,7 +227,7 @@ static void generateFirstSentence(void)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < 18; i++) {
|
||||
const char* word = words[i];
|
||||
const char* word = kWords[i];
|
||||
const char* separator = " ";
|
||||
if (i == 4)
|
||||
separator = ", ";
|
||||
@ -208,7 +235,7 @@ static void generateFirstSentence(void)
|
||||
separator = ", ";
|
||||
generateWord(word, separator, i == 0);
|
||||
}
|
||||
generateWord(words[18], ". ", 0);
|
||||
generateWord(kWords[18], ". ", 0);
|
||||
}
|
||||
|
||||
size_t
|
||||
@ -219,6 +246,10 @@ LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
|
||||
g_maxChars = size;
|
||||
g_nbChars = 0;
|
||||
g_randRoot = seed;
|
||||
if (g_distribCount == 0) {
|
||||
init_word_distrib(kWords, kNbWords, kWeights, kNbWeights);
|
||||
}
|
||||
|
||||
if (first) {
|
||||
generateFirstSentence();
|
||||
}
|
||||
|
Reference in New Issue
Block a user