runtime weight distribution table

and made small words a bit more common.
2025-07-29 11:21:22 +03:00 · 2024-02-20 12:26:37 -08:00
parent 5a1bb4a4e0
commit 3dbd861b7d
1 changed files with 57 additions and 26 deletions
--- a/programs/lorem.c
+++ b/programs/lorem.c
@ -38,7 +38,7 @@
 #define WORD_MAX_SIZE 20

 /* Define the word pool */
-static const char* words[] = {
+static const char* kWords[] = {
    "lorem",        "ipsum",      "dolor",       "sit",          "amet",
    "consectetur",  "adipiscing", "elit",        "sed",          "do",
    "eiusmod",      "tempor",     "incididunt",  "ut",           "labore",
@ -75,29 +75,56 @@ static const char* words[] = {
    "quam",         "nihil",      "molestiae",   "illum",        "fugiat",
    "quo",          "pariatur"
 };
+static const unsigned kNbWords = sizeof(kWords) / sizeof(kWords[0]);

-/* simple 1-dimension distribution that favors small words :
- * 1 letter : weight 3
- * 2-3 letters : weight 2
- * 4+ letters : weight 1
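+/* simple 1-dimension distribution, based on word's length, favors small words :
+ * kWeights[len] is the weight given to a word of `len` letters ;
+ * words too long for the table use its last entry.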
 */
-static const int distrib[] = {
-    0,   1,   2,   3,   3,   4,   5,   6,   7,   8,   8,   9,   9,   10,  11,
-    12,  13,  13,  14,  15,  15,  16,  17,  18,  19,  19,  20,  21,  22,  23,
-    24,  25,  26,  26,  26,  27,  28,  29,  30,  31,  32,  33,  34,  34,  35,
-    36,  37,  38,  39,  40,  41,  41,  42,  43,  43,  44,  45,  45,  46,  47,
-    48,  49,  50,  51,  52,  53,  54,  55,  55,  56,  57,  58,  58,  59,  60,
-    60,  61,  62,  63,  64,  65,  66,  67,  67,  68,  69,  70,  71,  72,  72,
-    73,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  84,  85,
-    86,  87,  88,  89,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,
-    100, 101, 101, 102, 103, 104, 105, 106, 106, 107, 108, 109, 110, 111, 112,
-    113, 114, 115, 116, 117, 118, 119, 129, 121, 122, 123, 124, 125, 126, 127,
-    128, 128, 129, 129, 130, 131, 132, 133, 134, 135, 136, 136, 137, 138, 139,
-    140, 141, 142, 143, 144, 145, 146, 146, 147, 148, 149, 150, 151, 152, 153,
-    154, 155, 156, 156, 157, 157, 158, 159, 160, 161, 161, 162, 163, 164, 165,
-    166, 167, 168, 169, 170, 170, 171,
-};
-static const unsigned distribCount = sizeof(distrib) / sizeof(distrib[0]);
+static const int kWeights[]      = { 0, 8, 6, 4, 3, 2 }; /* weight per word length (index = strlen); last entry is used for longer words */
+static const unsigned kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]); /* number of entries in kWeights[] */
+
+#define DISTRIB_SIZE_MAX 500 /* capacity of g_distrib[] */
+static int g_distrib[DISTRIB_SIZE_MAX] = { 0 }; /* word indices into kWords[], filled by init_word_distrib() */
+static unsigned g_distribCount         = 0; /* nb of valid entries in g_distrib[]; 0 means not initialized yet */
+
+/* countFreqs() :
+ * Sums the weight of every word in words[] and stores the result
+ * into g_distribCount.
+ * The weight of a word is weights[strlen(word)] ;
+ * when the word's length is >= nbWeights, the last entry of weights[] is used.
+ * Asserts that the resulting total does not exceed DISTRIB_SIZE_MAX.
+ */
+static void countFreqs(
+        const char* words[],
+        size_t nbWords,
+        const int* weights,
+        unsigned long nbWeights)
+{
+    unsigned sum = 0;
+    size_t n     = 0;
+    while (n < nbWords) {
+        unsigned long const wlen = strlen(words[n]);
+        /* clamp the length onto the last slot of the weight table */
+        unsigned long const slot = (wlen < nbWeights) ? wlen : nbWeights - 1;
+        sum += (unsigned)weights[slot];
+        n++;
+    }
+    g_distribCount = sum;
+    assert(g_distribCount <= DISTRIB_SIZE_MAX);
+}
+
+/* init_word_distrib() :
+ * Fills g_distrib[] with word indices : index w is written
+ * weights[strlen(words[w])] times in a row (the last entry of weights[]
+ * is used when the word's length is >= nbWeights).
+ * On return, g_distribCount is the number of slots actually filled.
+ *
+ * The capacity check done by countFreqs() is an assert(), which vanishes
+ * when NDEBUG is defined ; writes are therefore explicitly bounded by
+ * DISTRIB_SIZE_MAX here, and g_distribCount is re-synchronized on the
+ * real fill level, so that later lookups never index past valid entries.
+ */
+static void init_word_distrib(
+        const char* words[],
+        size_t nbWords,
+        const int* weights,
+        unsigned long nbWeights)
+{
+    size_t w, d = 0;
+    countFreqs(words, nbWords, weights, nbWeights);
+    for (w = 0; w < nbWords; w++) {
+        unsigned long len = strlen(words[w]);
+        int l, lmax;
+        if (len >= nbWeights)
+            len = nbWeights - 1;
+        lmax = weights[len];
+        /* never write past the table, even in release (NDEBUG) builds */
+        for (l = 0; l < lmax && d < DISTRIB_SIZE_MAX; l++) {
+            g_distrib[d++] = (int)w;
+        }
+    }
+    /* in a consistent setup, the fill level matches countFreqs()'s total */
+    assert(d == g_distribCount);
+    g_distribCount = (unsigned)d;
+}

 /* Note: this unit only works when invoked sequentially.
 * No concurrent access is allowed */
@ -166,8 +193,8 @@ static void generateSentence(int nbWords)
    const char* endSep = qmark ? "? " : ". ";
    int i;
    for (i = 0; i < nbWords; i++) {
-        int const wordID       = distrib[LOREM_rand(distribCount)];
-        const char* const word = words[wordID];
+        int const wordID       = g_distrib[LOREM_rand(g_distribCount)];
+        const char* const word = kWords[wordID];
        const char* sep        = " ";
        if (i == commaPos)
            sep = ", ";
@ -200,7 +227,7 @@ static void generateFirstSentence(void)
 {
    int i;
    for (i = 0; i < 18; i++) {
-        const char* word      = words[i];
+        const char* word      = kWords[i];
        const char* separator = " ";
        if (i == 4)
            separator = ", ";
@ -208,7 +235,7 @@ static void generateFirstSentence(void)
            separator = ", ";
        generateWord(word, separator, i == 0);
    }
-    generateWord(words[18], ". ", 0);
+    generateWord(kWords[18], ". ", 0);
 }

 size_t
@ -219,6 +246,10 @@ LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
    g_maxChars = size;
    g_nbChars  = 0;
    g_randRoot = seed;
+    if (g_distribCount == 0) {
+        init_word_distrib(kWords, kNbWords, kWeights, kNbWeights);
+    }
+
    if (first) {
        generateFirstSentence();
    }