mirror of
				https://github.com/facebook/zstd.git
				synced 2025-11-03 20:33:11 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			286 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			286 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/*
 | 
						|
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 | 
						|
 * All rights reserved.
 | 
						|
 *
 | 
						|
 * This source code is licensed under both the BSD-style license (found in the
 | 
						|
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 | 
						|
 * in the COPYING file in the root directory of this source tree).
 | 
						|
 * You may select, at your option, one of the above-listed licenses.
 | 
						|
 */
 | 
						|
 | 
						|
/* Implementation notes:
 *
 * This is a very simple lorem ipsum generator
 * which features a static list of words
 * and prints them one after another randomly
 * with a fake sentence / paragraph structure.
 *
 * The goal is to generate a printable text
 * that can be used to fake a text compression scenario.
 * The resulting compression / ratio curve of the lorem ipsum generator
 * is more satisfying than that of the previous statistical generator,
 * which was initially designed for entropy compression,
 * and lacks the regularity that is more representative of text.
 *
 * The compression ratio achievable on the generated lorem ipsum
 * is still a bit too good, presumably because the dictionary is a bit too
 * small. It would be possible to create some more complex scheme, notably by
 * enlarging the dictionary with a word generator, and adding grammatical rules
 * (composition) and syntax rules. But that's probably overkill for the intended
 * goal.
 */
 | 
						|
 | 
						|
#include "lorem.h"
 | 
						|
#include <assert.h>
 | 
						|
#include <limits.h> /* INT_MAX */
 | 
						|
#include <string.h> /* memcpy */
 | 
						|
 | 
						|
/* NOTE(review): WORD_MAX_SIZE is not referenced anywhere in the visible part
 * of this file; presumably an upper bound on the length of a pool word. */
#define WORD_MAX_SIZE 20

/* Word pool.
 * Entries are referenced by index: the random sentences pick indices through
 * the weighted distribution table, and generateFirstSentence() emits entries
 * 0 to 18 in order, so the order of the first entries matters. */
static const char* kWords[] = {
    /*   0 */ "lorem", "ipsum", "dolor", "sit", "amet",
    /*   5 */ "consectetur", "adipiscing", "elit", "sed", "do",
    /*  10 */ "eiusmod", "tempor", "incididunt", "ut", "labore",
    /*  15 */ "et", "dolore", "magna", "aliqua", "dis",
    /*  20 */ "lectus", "vestibulum", "mattis", "ullamcorper", "velit",
    /*  25 */ "commodo", "a", "lacus", "arcu", "magnis",
    /*  30 */ "parturient", "montes", "nascetur", "ridiculus", "mus",
    /*  35 */ "mauris", "nulla", "malesuada", "pellentesque", "eget",
    /*  40 */ "gravida", "in", "dictum", "non", "erat",
    /*  45 */ "nam", "voluptat", "maecenas", "blandit", "aliquam",
    /*  50 */ "etiam", "enim", "lobortis", "scelerisque", "fermentum",
    /*  55 */ "dui", "faucibus", "ornare", "at", "elementum",
    /*  60 */ "eu", "facilisis", "odio", "morbi", "quis",
    /*  65 */ "eros", "donec", "ac", "orci", "purus",
    /*  70 */ "turpis", "cursus", "leo", "vel", "porta",
    /*  75 */ "consequat", "interdum", "varius", "vulputate", "aliquet",
    /*  80 */ "pharetra", "nunc", "auctor", "urna", "id",
    /*  85 */ "metus", "viverra", "nibh", "cras", "mi",
    /*  90 */ "unde", "omnis", "iste", "natus", "error",
    /*  95 */ "perspiciatis", "voluptatem", "accusantium", "doloremque", "laudantium",
    /* 100 */ "totam", "rem", "aperiam", "eaque", "ipsa",
    /* 105 */ "quae", "ab", "illo", "inventore", "veritatis",
    /* 110 */ "quasi", "architecto", "beatae", "vitae", "dicta",
    /* 115 */ "sunt", "explicabo", "nemo", "ipsam", "quia",
    /* 120 */ "voluptas", "aspernatur", "aut", "odit", "fugit",
    /* 125 */ "consequuntur", "magni", "dolores", "eos", "qui",
    /* 130 */ "ratione", "sequi", "nesciunt", "neque", "porro",
    /* 135 */ "quisquam", "est", "dolorem", "adipisci", "numquam",
    /* 140 */ "eius", "modi", "tempora", "incidunt", "magnam",
    /* 145 */ "quaerat", "ad", "minima", "veniam", "nostrum",
    /* 150 */ "ullam", "corporis", "suscipit", "laboriosam", "nisi",
    /* 155 */ "aliquid", "ex", "ea", "commodi", "consequatur",
    /* 160 */ "autem", "eum", "iure", "voluptate", "esse",
    /* 165 */ "quam", "nihil", "molestiae", "illum", "fugiat",
    /* 170 */ "quo", "pariatur", "vero", "accusamus", "iusto",
    /* 175 */ "dignissimos", "ducimus", "blanditiis", "praesentium", "voluptatum",
    /* 180 */ "deleniti", "atque", "corrupti", "quos", "quas",
    /* 185 */ "molestias", "excepturi", "sint", "occaecati", "cupiditate",
    /* 190 */ "provident", "similique", "culpa", "officia", "deserunt",
    /* 195 */ "mollitia", "animi", "laborum", "dolorum", "fuga",
    /* 200 */ "harum", "quidem", "rerum", "facilis", "expedita",
    /* 205 */ "distinctio", "libero", "tempore", "cum", "soluta",
    /* 210 */ "nobis", "eligendi", "optio", "cumque", "impedit",
    /* 215 */ "minus", "quod", "maxime", "placeat", "facere",
    /* 220 */ "possimus", "assumenda", "repellendus", "temporibus", "quibusdam",
    /* 225 */ "officiis", "debitis", "saepe", "eveniet", "voluptates",
    /* 230 */ "repudiandae", "recusandae", "itaque", "earum", "hic",
    /* 235 */ "tenetur", "sapiente", "delectus", "reiciendis", "cillum",
    /* 240 */ "maiores", "alias", "perferendis", "doloribus", "asperiores",
    /* 245 */ "repellat", "minim", "nostrud", "exercitation", "ullamco",
    /* 250 */ "laboris", "aliquip", "duis", "aute", "irure",
};

/* Number of entries in kWords, derived from the array itself. */
static const unsigned kNbWords = sizeof kWords / sizeof kWords[0];
 | 
						|
 | 
						|
/* Simple one-dimensional distribution: the weight of a word depends only on
 * its length, and shorter words get the larger weights. The table is indexed
 * by word length; countFreqs() and init_word_distrib() map every length past
 * the last index onto the last entry. */
static const int kWeights[] = {
    0, /* length 0 */
    8, /* length 1 */
    6, /* length 2 */
    4, /* length 3 */
    3, /* length 4 */
    2, /* length 5 and above */
};
static const size_t kNbWeights = sizeof kWeights / sizeof kWeights[0];
 | 
						|
 | 
						|
/* Weighted distribution table.
 * g_distrib holds word indices; a word of weight N appears N times, so that
 * picking a uniformly random slot in [0, g_distribCount) selects words
 * proportionally to their weight. g_distribCount == 0 means "not built yet". */
#define DISTRIB_SIZE_MAX 650
static int g_distrib[DISTRIB_SIZE_MAX] = { 0 };
static unsigned g_distribCount         = 0;

/* Weight of a single word: weights[] is indexed by the word's length, lengths
 * past the end of the table use its last entry.
 * An empty table or a negative entry counts as weight 0, so that the counting
 * pass and the filling pass always agree. */
static unsigned
wordWeight(const char* word, const int* weights, size_t nbWeights)
{
    size_t len = strlen(word);
    if (nbWeights == 0)
        return 0;
    if (len >= nbWeights)
        len = nbWeights - 1;
    return (weights[len] > 0) ? (unsigned)weights[len] : 0;
}

/* Sums the weights of all words and stores the result into g_distribCount.
 *
 * words / nbWords     : word list to measure
 * weights / nbWeights : weight table, indexed by word length
 *
 * Asserts that the total fits into g_distrib (debug builds only). */
static void countFreqs(
        const char* words[],
        size_t nbWords,
        const int* weights,
        size_t nbWeights)
{
    unsigned total = 0;
    size_t w;
    for (w = 0; w < nbWords; w++) {
        total += wordWeight(words[w], weights, nbWeights);
    }
    g_distribCount = total;
    assert(g_distribCount <= DISTRIB_SIZE_MAX);
}

/* Builds the distribution table: word index w is stored as many times as the
 * weight of words[w], in word order.
 *
 * words / nbWords     : word list
 * weights / nbWeights : weight table, indexed by word length
 *
 * On return, g_distribCount is the number of valid entries in g_distrib. */
static void init_word_distrib(
        const char* words[],
        size_t nbWords,
        const int* weights,
        size_t nbWeights)
{
    size_t w, d = 0;
    countFreqs(words, nbWords, weights, nbWeights);
    for (w = 0; w < nbWords; w++) {
        unsigned const weight = wordWeight(words[w], weights, nbWeights);
        unsigned l;
        /* The capacity check in countFreqs() is an assert and vanishes under
         * NDEBUG: never write past the end of g_distrib regardless. */
        for (l = 0; l < weight && d < DISTRIB_SIZE_MAX; l++) {
            g_distrib[d++] = (int)w;
        }
    }
    /* Keep the advertised count equal to what was actually stored
     * (a no-op whenever the table is large enough). */
    g_distribCount = (unsigned)d;
}
 | 
						|
 | 
						|
/* Generator state.
 * Note: this unit only works when invoked sequentially.
 * No concurrent access is allowed: the generation functions of this file all
 * read and update these file-scope variables. */
static char* g_ptr         = NULL;     /* destination buffer of the current run */
static size_t g_nbChars    = 0;        /* nb of bytes written into g_ptr so far */
static size_t g_maxChars   = 10000000; /* capacity of g_ptr; LOREM_genBlock() overwrites it */
static unsigned g_randRoot = 0;        /* state of the pseudo-random generator */

/* Left rotation of a 32-bit value by r bits (0 < r < 32).
 * Both parameters are parenthesized so that expression arguments expand
 * correctly. Assumes x is an unsigned 32-bit quantity. */
#define RDG_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))

/* Advances the pseudo-random state and returns a value scaled by `range`.
 *
 * The state g_randRoot is multiplied by prime1, xored with prime2 and rotated
 * left by 13 bits; the new state is stored back, then scaled by taking the
 * high half of the 64-bit product state * range.
 *
 * With a 32-bit `unsigned` the result lies in [0, range); range == 0 gives 0.
 * The sequence depends only on the state, hence on the seed. */
static unsigned LOREM_rand(unsigned range)
{
    static const unsigned prime1 = 2654435761U;
    static const unsigned prime2 = 2246822519U;
    unsigned rand32              = g_randRoot;
    rand32 *= prime1;
    rand32 ^= prime2;
    rand32     = RDG_rotl32(rand32, 13);
    g_randRoot = rand32;
    return (unsigned)(((unsigned long long)rand32 * range) >> 32);
}
 | 
						|
 | 
						|
static void writeLastCharacters(void)
 | 
						|
{
 | 
						|
    size_t lastChars = g_maxChars - g_nbChars;
 | 
						|
    assert(g_maxChars >= g_nbChars);
 | 
						|
    if (lastChars == 0)
 | 
						|
        return;
 | 
						|
    g_ptr[g_nbChars++] = '.';
 | 
						|
    if (lastChars > 2) {
 | 
						|
        memset(g_ptr + g_nbChars, ' ', lastChars - 2);
 | 
						|
    }
 | 
						|
    if (lastChars > 1) {
 | 
						|
        g_ptr[g_maxChars - 1] = '\n';
 | 
						|
    }
 | 
						|
    g_nbChars = g_maxChars;
 | 
						|
}
 | 
						|
 | 
						|
static void generateWord(const char* word, const char* separator, int upCase)
 | 
						|
{
 | 
						|
    size_t const len = strlen(word) + strlen(separator);
 | 
						|
    if (g_nbChars + len > g_maxChars) {
 | 
						|
        writeLastCharacters();
 | 
						|
        return;
 | 
						|
    }
 | 
						|
    memcpy(g_ptr + g_nbChars, word, strlen(word));
 | 
						|
    if (upCase) {
 | 
						|
        static const char toUp = 'A' - 'a';
 | 
						|
        g_ptr[g_nbChars]       = (char)(g_ptr[g_nbChars] + toUp);
 | 
						|
    }
 | 
						|
    g_nbChars += strlen(word);
 | 
						|
    memcpy(g_ptr + g_nbChars, separator, strlen(separator));
 | 
						|
    g_nbChars += strlen(separator);
 | 
						|
}
 | 
						|
 | 
						|
/* Returns a random count "around" target: the sum of two LOREM_rand(target)
 * draws, plus one. Since each draw is below target (for target >= 1), the
 * result lies in [1, 2 * target - 1]. Consumes two random draws. */
static int about(unsigned target)
{
    unsigned const firstDraw  = LOREM_rand(target);
    unsigned const secondDraw = LOREM_rand(target);
    return (int)(firstDraw + secondDraw + 1);
}
 | 
						|
 | 
						|
/* Function to generate a random sentence */
 | 
						|
static void generateSentence(int nbWords)
 | 
						|
{
 | 
						|
    int commaPos       = about(9);
 | 
						|
    int comma2         = commaPos + about(7);
 | 
						|
    int qmark          = (LOREM_rand(11) == 7);
 | 
						|
    const char* endSep = qmark ? "? " : ". ";
 | 
						|
    int i;
 | 
						|
    for (i = 0; i < nbWords; i++) {
 | 
						|
        int const wordID       = g_distrib[LOREM_rand(g_distribCount)];
 | 
						|
        const char* const word = kWords[wordID];
 | 
						|
        const char* sep        = " ";
 | 
						|
        if (i == commaPos)
 | 
						|
            sep = ", ";
 | 
						|
        if (i == comma2)
 | 
						|
            sep = ", ";
 | 
						|
        if (i == nbWords - 1)
 | 
						|
            sep = endSep;
 | 
						|
        generateWord(word, sep, i == 0);
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
static void generateParagraph(int nbSentences)
 | 
						|
{
 | 
						|
    int i;
 | 
						|
    for (i = 0; i < nbSentences; i++) {
 | 
						|
        int wordsPerSentence = about(11);
 | 
						|
        generateSentence(wordsPerSentence);
 | 
						|
    }
 | 
						|
    if (g_nbChars < g_maxChars) {
 | 
						|
        g_ptr[g_nbChars++] = '\n';
 | 
						|
    }
 | 
						|
    if (g_nbChars < g_maxChars) {
 | 
						|
        g_ptr[g_nbChars++] = '\n';
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
/* It's "common" for lorem ipsum generators to start with the same first
 | 
						|
 * pre-defined sentence */
 | 
						|
static void generateFirstSentence(void)
 | 
						|
{
 | 
						|
    int i;
 | 
						|
    for (i = 0; i < 18; i++) {
 | 
						|
        const char* word      = kWords[i];
 | 
						|
        const char* separator = " ";
 | 
						|
        if (i == 4)
 | 
						|
            separator = ", ";
 | 
						|
        if (i == 7)
 | 
						|
            separator = ", ";
 | 
						|
        generateWord(word, separator, i == 0);
 | 
						|
    }
 | 
						|
    generateWord(kWords[18], ". ", 0);
 | 
						|
}
 | 
						|
 | 
						|
size_t
 | 
						|
LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
 | 
						|
{
 | 
						|
    g_ptr = (char*)buffer;
 | 
						|
    assert(size < INT_MAX);
 | 
						|
    g_maxChars = size;
 | 
						|
    g_nbChars  = 0;
 | 
						|
    g_randRoot = seed;
 | 
						|
    if (g_distribCount == 0) {
 | 
						|
        init_word_distrib(kWords, kNbWords, kWeights, kNbWeights);
 | 
						|
    }
 | 
						|
 | 
						|
    if (first) {
 | 
						|
        generateFirstSentence();
 | 
						|
    }
 | 
						|
    while (g_nbChars < g_maxChars) {
 | 
						|
        int sentencePerParagraph = about(7);
 | 
						|
        generateParagraph(sentencePerParagraph);
 | 
						|
        if (!fill)
 | 
						|
            break; /* only generate one paragraph in not-fill mode */
 | 
						|
    }
 | 
						|
    g_ptr = NULL;
 | 
						|
    return g_nbChars;
 | 
						|
}
 | 
						|
 | 
						|
/* Fills the whole `buffer` (`size` bytes) with lorem ipsum text derived from
 * `seed`, starting with the fixed opening sentence.
 * The byte count returned by LOREM_genBlock() is discarded. */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    int const withFirstSentence = 1;
    int const fillWholeBuffer   = 1;
    (void)LOREM_genBlock(buffer, size, seed, withFirstSentence, fillWholeBuffer);
}
 |