increase vocabulary size

Makes compression slightly less effective, and hence a bit more comparable with real text (though still too easy to compress). Level 6 is now stronger than level 4, by a hair. However, there is still a ratio dip at level 5.
2025-07-29 11:21:22 +03:00 · 2024-02-20 00:12:32 -08:00
parent 889392dac8
commit 1e046ce7fa
2 changed files with 141 additions and 123 deletions
--- a/.gitignore
+++ b/.gitignore
@ -39,12 +39,15 @@ buck-out/
 build-*
 *.gcda
 # IDE
 .clang_complete
 compile_flags.txt
 .clang-format
 # Other files
 .directory
 _codelite/
 _zstdbench/
 .clang_complete
 compile_flags.txt
 *.idea
 *.swp
 .DS_Store
--- a/programs/lorem.c
+++ b/programs/lorem.c
@ -8,7 +8,6 @@
 * You may select, at your option, one of the above-listed licenses.
 */
 /* Implementation notes:
 *
 * This is a very simple lorem ipsum generator
@ -32,83 +31,97 @@
 */
 #include "lorem.h"
 #include <string.h>  /* memcpy */
 #include <limits.h>  /* INT_MAX */
 #include <assert.h>
 #include <limits.h> /* INT_MAX */
 #include <string.h> /* memcpy */
 #define WORD_MAX_SIZE 20
 /* Define the word pool */
/* Vocabulary used by the generator (125 entries, indices 0..124).
 * Both the strings and the table of pointers are read-only, hence the
 * double `const`. Entries are selected by index through `distrib`, and the
 * first 19 entries, in order, form the fixed opening sentence, so the order
 * of this table must not be changed. */
static const char* const words[] = {
    "lorem",        "ipsum",      "dolor",       "sit",          "amet",
    "consectetur",  "adipiscing", "elit",        "sed",          "do",
    "eiusmod",      "tempor",     "incididunt",  "ut",           "labore",
    "et",           "dolore",     "magna",       "aliqua",       "dis",
    "lectus",       "vestibulum", "mattis",      "ullamcorper",  "velit",
    "commodo",      "a",          "lacus",       "arcu",         "magnis",
    "parturient",   "montes",     "nascetur",    "ridiculus",    "mus",
    "mauris",       "nulla",      "malesuada",   "pellentesque", "eget",
    "gravida",      "in",         "dictum",      "non",          "erat",
    "nam",          "voluptat",   "maecenas",    "blandit",      "aliquam",
    "etiam",        "enim",       "lobortis",    "scelerisque",  "fermentum",
    "dui",          "faucibus",   "ornare",      "at",           "elementum",
    "eu",           "facilisis",  "odio",        "morbi",        "quis",
    "eros",         "donec",      "ac",          "orci",         "purus",
    "turpis",       "cursus",     "leo",         "vel",          "porta",
    "consequat",    "interdum",   "varius",      "vulputate",    "aliquet",
    "pharetra",     "nunc",       "auctor",      "urna",         "id",
    "metus",        "viverra",    "nibh",        "cras",         "mi",
    "unde",         "omnis",      "iste",        "natus",        "error",
    "perspiciatis", "voluptatem", "accusantium", "doloremque",   "laudantium",
    "totam",        "rem",        "aperiam",     "eaque",        "ipsa",
    "quae",         "ab",         "illo",        "inventore",    "veritatis",
    "quasi",        "architecto", "beatae",      "vitae",        "dicta",
    "sunt",         "explicabo",  "nemo",        "ipsam",        "quia",
    "voluptas",     "aspernatur", "aut",         "odit",         "fugit"
};
/* Simple 1-dimension distribution used to pick words.
 * Each entry is an index into `words`; a random position of this table is
 * drawn, so an index listed several times is selected proportionally more
 * often. The intended weighting favors small words :
 * 1 letter : weight 3
 * 2-3 letters : weight 2
 * 4+ letters : weight 1
 * Every entry MUST be a valid index into `words` (0..124): a larger value
 * makes the generator read past the end of the word table.
 */
static const int distrib[] = {
    0,   1,   2,   3,   3,   4,   5,   6,   7,   8,   8,   9,   9,   10,  11,
    12,  13,  13,  14,  15,  15,  16,  17,  18,  19,  19,  20,  21,  22,  23,
    24,  25,  26,  26,  26,  27,  28,  29,  30,  31,  32,  33,  34,  34,  35,
    36,  37,  38,  39,  40,  41,  41,  42,  43,  43,  44,  45,  45,  46,  47,
    48,  49,  50,  51,  52,  53,  54,  55,  55,  56,  57,  58,  58,  59,  60,
    60,  61,  62,  63,  64,  65,  66,  67,  67,  68,  69,  70,  71,  72,  72,
    73,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  84,  85,
    86,  87,  88,  89,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,
    100, 101, 101, 102, 103, 104, 105, 106, 106, 107, 108, 109, 110, 111, 112,
    113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124,
};
/* Number of entries in `distrib`, i.e. the range to draw from. */
static const unsigned distribCount = sizeof(distrib) / sizeof(distrib[0]);
 /* Note: this unit only works when invoked sequentially.
 * No concurrent access is allowed */
/* Generator state, shared by every helper of this unit. */
static char* g_ptr         = NULL;     /* destination buffer of the generation in progress */
static size_t g_nbChars    = 0;        /* number of characters written into g_ptr so far */
static size_t g_maxChars   = 10000000; /* capacity of g_ptr, never exceeded by the writers */
static unsigned g_randRoot = 0;        /* current state of the pseudo-random generator */

/* Rotate the 32-bit value `x` left by `r` bits.
 * `r` must be within 1..31, since `r == 0` would shift by 32.
 * Arguments are fully parenthesized so that expressions can be passed safely;
 * each argument is evaluated more than once, so avoid side effects.
 * NOTE(review): assumes `x` is a 32-bit unsigned value -- confirm on targets
 * where `unsigned` is wider. */
#define RDG_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
-static unsigned LOREM_rand(unsigned range) {
+static unsigned LOREM_rand(unsigned range)
-  static const unsigned prime1 = 2654435761U;
+{
-  static const unsigned prime2 = 2246822519U;
+    static const unsigned prime1 = 2654435761U;
-  unsigned rand32 = g_randRoot;
+    static const unsigned prime2 = 2246822519U;
-  rand32 *= prime1;
+    unsigned rand32              = g_randRoot;
-  rand32 ^= prime2;
+    rand32 *= prime1;
-  rand32 = RDG_rotl32(rand32, 13);
+    rand32 ^= prime2;
-  g_randRoot = rand32;
+    rand32     = RDG_rotl32(rand32, 13);
-  return (unsigned)(((unsigned long long)rand32 * range) >> 32);
+    g_randRoot = rand32;
    return (unsigned)(((unsigned long long)rand32 * range) >> 32);
 }
-static void writeLastCharacters(void) {
+static void writeLastCharacters(void)
-  size_t lastChars = g_maxChars - g_nbChars;
+{
-  assert(g_maxChars >= g_nbChars);
+    size_t lastChars = g_maxChars - g_nbChars;
-  if (lastChars == 0)
+    assert(g_maxChars >= g_nbChars);
-    return;
+    if (lastChars == 0)
-  g_ptr[g_nbChars++] = '.';
+        return;
-  if (lastChars > 2) {
+    g_ptr[g_nbChars++] = '.';
-    memset(g_ptr + g_nbChars, ' ', lastChars - 2);
+    if (lastChars > 2) {
-  }
+        memset(g_ptr + g_nbChars, ' ', lastChars - 2);
-  if (lastChars > 1) {
+    }
-    g_ptr[g_maxChars-1] = '\n';
+    if (lastChars > 1) {
-  }
+        g_ptr[g_maxChars - 1] = '\n';
-  g_nbChars = g_maxChars;
+    }
    g_nbChars = g_maxChars;
 }
-static void generateWord(const char *word, const char *separator, int upCase)
+static void generateWord(const char* word, const char* separator, int upCase)
 {
    size_t const len = strlen(word) + strlen(separator);
    if (g_nbChars + len > g_maxChars) {
@ -118,90 +131,92 @@ static void generateWord(const char *word, const char *separator, int upCase)
    memcpy(g_ptr + g_nbChars, word, strlen(word));
    if (upCase) {
        static const char toUp = 'A' - 'a';
-        g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp);
+        g_ptr[g_nbChars]       = (char)(g_ptr[g_nbChars] + toUp);
    }
    g_nbChars += strlen(word);
    memcpy(g_ptr + g_nbChars, separator, strlen(separator));
    g_nbChars += strlen(separator);
 }
/* Return a random quantity "around" `target` : the sum of two draws of
 * LOREM_rand(target), plus one. The state of the generator advances twice. */
static int about(unsigned target)
{
    unsigned const firstDraw  = LOREM_rand(target);
    unsigned const secondDraw = LOREM_rand(target);
    return (int)(firstDraw + secondDraw + 1);
}
 /* Function to generate a random sentence */
-static void generateSentence(int nbWords) {
+static void generateSentence(int nbWords)
-  int commaPos = about(9);
+{
-  int comma2 = commaPos + about(7);
+    int commaPos = about(9);
-  int i;
+    int comma2   = commaPos + about(7);
-  for (i = 0; i < nbWords; i++) {
+    int i;
-    int const wordID = distrib[LOREM_rand(distribCount)];
+    for (i = 0; i < nbWords; i++) {
-    const char *const word = words[wordID];
+        int const wordID       = distrib[LOREM_rand(distribCount)];
-    const char* sep = " ";
+        const char* const word = words[wordID];
-    if (i == commaPos)
+        const char* sep        = " ";
-      sep = ", ";
+        if (i == commaPos)
-    if (i == comma2)
+            sep = ", ";
-      sep = ", ";
+        if (i == comma2)
-    if (i == nbWords - 1)
+            sep = ", ";
-      sep = ". ";
+        if (i == nbWords - 1)
-    generateWord(word, sep, i==0);
+            sep = ". ";
-  }
+        generateWord(word, sep, i == 0);
    }
 }
-static void generateParagraph(int nbSentences) {
+static void generateParagraph(int nbSentences)
-  int i;
+{
-  for (i = 0; i < nbSentences; i++) {
+    int i;
-    int wordsPerSentence = about(8);
+    for (i = 0; i < nbSentences; i++) {
-    generateSentence(wordsPerSentence);
+        int wordsPerSentence = about(8);
-  }
+        generateSentence(wordsPerSentence);
-  if (g_nbChars < g_maxChars) {
+    }
-    g_ptr[g_nbChars++] = '\n';
+    if (g_nbChars < g_maxChars) {
-  }
+        g_ptr[g_nbChars++] = '\n';
-  if (g_nbChars < g_maxChars) {
+    }
-    g_ptr[g_nbChars++] = '\n';
+    if (g_nbChars < g_maxChars) {
-  }
+        g_ptr[g_nbChars++] = '\n';
    }
 }
 /* It's "common" for lorem ipsum generators to start with the same first
 * pre-defined sentence */
-static void generateFirstSentence(void) {
+static void generateFirstSentence(void)
-  int i;
+{
-  for (i = 0; i < 18; i++) {
+    int i;
-    const char *word = words[i];
+    for (i = 0; i < 18; i++) {
-    const char *separator = " ";
+        const char* word      = words[i];
-    if (i == 4)
+        const char* separator = " ";
-      separator = ", ";
+        if (i == 4)
-    if (i == 7)
+            separator = ", ";
-      separator = ", ";
+        if (i == 7)
-    generateWord(word, separator, i==0);
+            separator = ", ";
-  }
+        generateWord(word, separator, i == 0);
-  generateWord(words[18], ". ", 0);
+    }
    generateWord(words[18], ". ", 0);
 }
-size_t LOREM_genBlock(void* buffer, size_t size,
+size_t
-                      unsigned seed,
+LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
                      int first, int fill)
 {
-  g_ptr = (char*)buffer;
+    g_ptr = (char*)buffer;
-  assert(size < INT_MAX);
+    assert(size < INT_MAX);
-  g_maxChars = size;
+    g_maxChars = size;
-  g_nbChars = 0;
+    g_nbChars  = 0;
-  g_randRoot = seed;
+    g_randRoot = seed;
-  if (first) {
+    if (first) {
-    generateFirstSentence();
+        generateFirstSentence();
-  }
+    }
-  while (g_nbChars < g_maxChars) {
+    while (g_nbChars < g_maxChars) {
-    int sentencePerParagraph = about(7);
+        int sentencePerParagraph = about(7);
-    generateParagraph(sentencePerParagraph);
+        generateParagraph(sentencePerParagraph);
-    if (!fill)
+        if (!fill)
-      break; /* only generate one paragraph in not-fill mode */
+            break; /* only generate one paragraph in not-fill mode */
-  }
+    }
-  g_ptr = NULL;
+    g_ptr = NULL;
-  return g_nbChars;
+    return g_nbChars;
 }
 /* Fill the whole of `buffer` (`size` bytes) with lorem ipsum text derived
  * from `seed`, starting with the fixed opening sentence.
  * The character count reported by LOREM_genBlock() is ignored. */
 void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
 {
     int const startWithFirstSentence = 1;
     int const fillWholeBuffer        = 1;
     (void)LOREM_genBlock(buffer, size, seed, startWithFirstSentence, fillWholeBuffer);
 }