1
0
mirror of https://github.com/facebook/zstd.git synced 2025-07-29 11:21:22 +03:00

increase vocabulary size

This makes the generated text slightly less compressible,
and hence a bit more comparable to real text (though it is still too easy to compress).
Level 6 is now stronger than level 4, by a hair.
However, there is still a ratio dip at level 5.
This commit is contained in:
Yann Collet
2024-02-20 00:12:32 -08:00
parent 889392dac8
commit 1e046ce7fa
2 changed files with 141 additions and 123 deletions

7
.gitignore vendored
View File

@ -39,12 +39,15 @@ buck-out/
build-* build-*
*.gcda *.gcda
# IDE
.clang_complete
compile_flags.txt
.clang-format
# Other files # Other files
.directory .directory
_codelite/ _codelite/
_zstdbench/ _zstdbench/
.clang_complete
compile_flags.txt
*.idea *.idea
*.swp *.swp
.DS_Store .DS_Store

View File

@ -8,7 +8,6 @@
* You may select, at your option, one of the above-listed licenses. * You may select, at your option, one of the above-listed licenses.
*/ */
/* Implementation notes: /* Implementation notes:
* *
* This is a very simple lorem ipsum generator * This is a very simple lorem ipsum generator
@ -32,83 +31,97 @@
*/ */
#include "lorem.h" #include "lorem.h"
#include <string.h> /* memcpy */
#include <limits.h> /* INT_MAX */
#include <assert.h> #include <assert.h>
#include <limits.h> /* INT_MAX */
#include <string.h> /* memcpy */
#define WORD_MAX_SIZE 20 #define WORD_MAX_SIZE 20
/* Define the word pool */ /* Define the word pool */
/* Word pool used by the generator.
 * Order matters:
 * - `distrib[]` stores indices into this table,
 * - generateFirstSentence() emits entries 0..18 verbatim, in order.
 * Both the strings and the pointer table are read-only. */
static const char* const words[] = {
    "lorem",        "ipsum",      "dolor",       "sit",          "amet",
    "consectetur",  "adipiscing", "elit",        "sed",          "do",
    "eiusmod",      "tempor",     "incididunt",  "ut",           "labore",
    "et",           "dolore",     "magna",       "aliqua",       "dis",
    "lectus",       "vestibulum", "mattis",      "ullamcorper",  "velit",
    "commodo",      "a",          "lacus",       "arcu",         "magnis",
    "parturient",   "montes",     "nascetur",    "ridiculus",    "mus",
    "mauris",       "nulla",      "malesuada",   "pellentesque", "eget",
    "gravida",      "in",         "dictum",      "non",          "erat",
    "nam",          "voluptat",   "maecenas",    "blandit",      "aliquam",
    "etiam",        "enim",       "lobortis",    "scelerisque",  "fermentum",
    "dui",          "faucibus",   "ornare",      "at",           "elementum",
    "eu",           "facilisis",  "odio",        "morbi",        "quis",
    "eros",         "donec",      "ac",          "orci",         "purus",
    "turpis",       "cursus",     "leo",         "vel",          "porta",
    "consequat",    "interdum",   "varius",      "vulputate",    "aliquet",
    "pharetra",     "nunc",       "auctor",      "urna",         "id",
    "metus",        "viverra",    "nibh",        "cras",         "mi",
    "unde",         "omnis",      "iste",        "natus",        "error",
    "perspiciatis", "voluptatem", "accusantium", "doloremque",   "laudantium",
    "totam",        "rem",        "aperiam",     "eaque",        "ipsa",
    "quae",         "ab",         "illo",        "inventore",    "veritatis",
    "quasi",        "architecto", "beatae",      "vitae",        "dicta",
    "sunt",         "explicabo",  "nemo",        "ipsam",        "quia",
    "voluptas",     "aspernatur", "aut",         "odit",         "fugit"
};
/* Simple 1-dimension distribution that favors small words :
 * 1 letter    : weight 3
 * 2-3 letters : weight 2
 * 4+ letters  : weight 1
 *
 * Each entry is an index into `words[]`; a word's weight is the number of
 * times its index is listed. A sentence word is selected by drawing a
 * uniformly distributed position in this table, so every entry MUST be a
 * valid `words[]` index (0..124 for the 125-word pool).
 *
 * Note: "aut" (index 122) is listed only once although it has 3 letters.
 * Left as is, because changing the table length would alter every
 * generated stream. */
static const int distrib[] = {
    0,   1,   2,   3,   3,   4,   5,   6,   7,   8,   8,   9,   9,   10,  11,
    12,  13,  13,  14,  15,  15,  16,  17,  18,  19,  19,  20,  21,  22,  23,
    24,  25,  26,  26,  26,  27,  28,  29,  30,  31,  32,  33,  34,  34,  35,
    36,  37,  38,  39,  40,  41,  41,  42,  43,  43,  44,  45,  45,  46,  47,
    48,  49,  50,  51,  52,  53,  54,  55,  55,  56,  57,  58,  58,  59,  60,
    60,  61,  62,  63,  64,  65,  66,  67,  67,  68,  69,  70,  71,  72,  72,
    73,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  84,  85,
    86,  87,  88,  89,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,
    100, 101, 101, 102, 103, 104, 105, 106, 106, 107, 108, 109, 110, 111, 112,
    113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124,
};
/* Number of entries in `distrib[]`, i.e. the range of the random draw. */
static const unsigned distribCount = sizeof(distrib) / sizeof(distrib[0]);
/* Generator state.
 * Note: this unit only works when invoked sequentially.
 * No concurrent access is allowed : all state below is shared. */
static char* g_ptr        = NULL;     /* destination buffer of the current run */
static size_t g_nbChars   = 0;        /* number of characters written so far */
static size_t g_maxChars  = 10000000; /* capacity of the destination buffer */
static unsigned g_randRoot = 0;       /* state of the pseudo-random generator */

/* Rotate `x` left by `r` bits.
 * Expects `x` to be a 32-bit unsigned value and 0 < r < 32.
 * Arguments are parenthesized so that expressions can be passed safely;
 * each argument is evaluated twice, so it must not have side effects. */
#define RDG_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
/* Advance the generator state `g_randRoot` (multiply, xor, rotate)
 * and map the new 32-bit state onto [0, range) by taking the upper half
 * of the 64-bit product `state * range`.
 * Assumes `unsigned` is 32 bits wide.
 * @return a pseudo-random value lower than `range` (0 when `range` is 0) */
static unsigned LOREM_rand(unsigned range)
{
    static const unsigned prime1 = 2654435761U;
    static const unsigned prime2 = 2246822519U;
    unsigned state = (g_randRoot * prime1) ^ prime2;
    state      = RDG_rotl32(state, 13);
    g_randRoot = state;
    return (unsigned)(((unsigned long long)state * range) >> 32);
}
/* Fill whatever room is left in the buffer so that it ends cleanly :
 * a final '.', then spaces, then a terminating '\n' in the very last byte.
 * With a single byte left, only the '.' is written.
 * On return, the buffer is considered full (g_nbChars == g_maxChars). */
static void writeLastCharacters(void)
{
    size_t remaining;
    assert(g_maxChars >= g_nbChars);
    remaining = g_maxChars - g_nbChars;
    if (remaining == 0) {
        return;
    }
    g_ptr[g_nbChars] = '.';
    g_nbChars++;
    if (remaining >= 3) {
        /* everything between the '.' and the final '\n' */
        memset(g_ptr + g_nbChars, ' ', remaining - 2);
    }
    if (remaining >= 2) {
        g_ptr[g_maxChars - 1] = '\n';
    }
    g_nbChars = g_maxChars;
}
static void generateWord(const char *word, const char *separator, int upCase) static void generateWord(const char* word, const char* separator, int upCase)
{ {
size_t const len = strlen(word) + strlen(separator); size_t const len = strlen(word) + strlen(separator);
if (g_nbChars + len > g_maxChars) { if (g_nbChars + len > g_maxChars) {
@ -118,90 +131,92 @@ static void generateWord(const char *word, const char *separator, int upCase)
memcpy(g_ptr + g_nbChars, word, strlen(word)); memcpy(g_ptr + g_nbChars, word, strlen(word));
if (upCase) { if (upCase) {
static const char toUp = 'A' - 'a'; static const char toUp = 'A' - 'a';
g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp); g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp);
} }
g_nbChars += strlen(word); g_nbChars += strlen(word);
memcpy(g_ptr + g_nbChars, separator, strlen(separator)); memcpy(g_ptr + g_nbChars, separator, strlen(separator));
g_nbChars += strlen(separator); g_nbChars += strlen(separator);
} }
/* Draw a value "around" `target` : the sum of two draws in [0, target)
 * plus one. For target >= 1, the result lies in [1, 2*target - 1].
 * Consumes two values from the random generator. */
static int about(unsigned target)
{
    unsigned const draw1 = LOREM_rand(target);
    unsigned const draw2 = LOREM_rand(target);
    return (int)(draw1 + draw2 + 1);
}
/* Function to generate a random sentence */ /* Function to generate a random sentence */
static void generateSentence(int nbWords) { static void generateSentence(int nbWords)
int commaPos = about(9); {
int comma2 = commaPos + about(7); int commaPos = about(9);
int i; int comma2 = commaPos + about(7);
for (i = 0; i < nbWords; i++) { int i;
int const wordID = distrib[LOREM_rand(distribCount)]; for (i = 0; i < nbWords; i++) {
const char *const word = words[wordID]; int const wordID = distrib[LOREM_rand(distribCount)];
const char* sep = " "; const char* const word = words[wordID];
if (i == commaPos) const char* sep = " ";
sep = ", "; if (i == commaPos)
if (i == comma2) sep = ", ";
sep = ", "; if (i == comma2)
if (i == nbWords - 1) sep = ", ";
sep = ". "; if (i == nbWords - 1)
generateWord(word, sep, i==0); sep = ". ";
} generateWord(word, sep, i == 0);
}
} }
/* Generate a paragraph of `nbSentences` sentences, each "about" 8 words long,
 * then terminate it with up to two '\n', as far as buffer room allows. */
static void generateParagraph(int nbSentences)
{
    int remaining;
    int newlines;
    for (remaining = nbSentences; remaining > 0; remaining--) {
        generateSentence(about(8));
    }
    for (newlines = 0; newlines < 2 && g_nbChars < g_maxChars; newlines++) {
        g_ptr[g_nbChars] = '\n';
        g_nbChars++;
    }
}
/* It's "common" for lorem ipsum generators to start with the same first /* It's "common" for lorem ipsum generators to start with the same first
* pre-defined sentence */ * pre-defined sentence */
static void generateFirstSentence(void) { static void generateFirstSentence(void)
int i; {
for (i = 0; i < 18; i++) { int i;
const char *word = words[i]; for (i = 0; i < 18; i++) {
const char *separator = " "; const char* word = words[i];
if (i == 4) const char* separator = " ";
separator = ", "; if (i == 4)
if (i == 7) separator = ", ";
separator = ", "; if (i == 7)
generateWord(word, separator, i==0); separator = ", ";
} generateWord(word, separator, i == 0);
generateWord(words[18], ". ", 0); }
generateWord(words[18], ". ", 0);
} }
size_t LOREM_genBlock(void* buffer, size_t size, size_t
unsigned seed, LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
int first, int fill)
{ {
g_ptr = (char*)buffer; g_ptr = (char*)buffer;
assert(size < INT_MAX); assert(size < INT_MAX);
g_maxChars = size; g_maxChars = size;
g_nbChars = 0; g_nbChars = 0;
g_randRoot = seed; g_randRoot = seed;
if (first) { if (first) {
generateFirstSentence(); generateFirstSentence();
} }
while (g_nbChars < g_maxChars) { while (g_nbChars < g_maxChars) {
int sentencePerParagraph = about(7); int sentencePerParagraph = about(7);
generateParagraph(sentencePerParagraph); generateParagraph(sentencePerParagraph);
if (!fill) if (!fill)
break; /* only generate one paragraph in not-fill mode */ break; /* only generate one paragraph in not-fill mode */
} }
g_ptr = NULL; g_ptr = NULL;
return g_nbChars; return g_nbChars;
} }
/* Fill `buffer` with `size` bytes worth of lorem ipsum text :
 * starts with the fixed opening sentence, then fills the whole buffer.
 * The number of characters reported by LOREM_genBlock() is not returned. */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    int const withFirstSentence = 1;
    int const fillBuffer        = 1;
    (void)LOREM_genBlock(buffer, size, seed, withFirstSentence, fillBuffer);
}