From d0b7da30e26406c7ece2bf538a70410e80b9de9f Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Mon, 29 Jan 2024 15:00:32 -0800
Subject: [PATCH] add a lorem ipsum generator

this generator replaces the statistical generator
for the general case when no statistic is requested.

Generated data features a compression level speed / ratio curve
which is more in line with expectation.
---
 programs/benchzstd.c |  17 ++--
 programs/benchzstd.h |  11 +--
 programs/lorem.c     | 213 +++++++++++++++++++++++++++++++++++++++++++
 programs/lorem.h     |  37 ++++++++
 programs/zstdcli.c   |   4 +-
 5 files changed, 270 insertions(+), 12 deletions(-)
 create mode 100644 programs/lorem.c
 create mode 100644 programs/lorem.h

diff --git a/programs/benchzstd.c b/programs/benchzstd.c
index 9bc3628ee..b3af4c332 100644
--- a/programs/benchzstd.c
+++ b/programs/benchzstd.c
@@ -32,12 +32,13 @@
 #include "benchfn.h"
 #include "../lib/common/mem.h"
 #ifndef ZSTD_STATIC_LINKING_ONLY
-#define ZSTD_STATIC_LINKING_ONLY
+# define ZSTD_STATIC_LINKING_ONLY
 #endif
 #include "../lib/zstd.h"
 #include "datagen.h" /* RDG_genBuffer */
+#include "lorem.h" /* LOREM_genBuffer */
 #ifndef XXH_INLINE_ALL
-#define XXH_INLINE_ALL
+# define XXH_INLINE_ALL
 #endif
 #include "../lib/common/xxhash.h"
 #include "benchzstd.h"
@@ -701,7 +702,8 @@ int BMK_syntheticTest(int cLevel, double compressibility,
                       const ZSTD_compressionParameters* compressionParams,
                       int displayLevel, const BMK_advancedParams_t* adv)
 {
-    char name[20] = {0};
+    char nameBuff[20] = {0};
+    const char* name = nameBuff;
     size_t const benchedSize = 10000000;
     void* srcBuffer;
     BMK_benchOutcome_t res;
@@ -719,10 +721,15 @@ int BMK_syntheticTest(int cLevel, double compressibility,
     }
 
     /* Fill input buffer */
-    RDG_genBuffer(srcBuffer, benchedSize, compressibility, 0.0, 0);
+    if (compressibility < 0.0) {
+        LOREM_genBuffer(srcBuffer, benchedSize, 0);
+        name = "Lorem ipsum";
+    } else {
+        RDG_genBuffer(srcBuffer, benchedSize, compressibility, 0.0, 0);
+        snprintf (nameBuff, sizeof(nameBuff), "Synthetic %2u%%", (unsigned)(compressibility*100));
+    }
 
     /* Bench */
-    snprintf (name, sizeof(name), "Synthetic %2u%%", (unsigned)(compressibility*100));
     res = BMK_benchCLevel(srcBuffer, benchedSize,
                     &benchedSize /* ? */, 1 /* ? */,
                     cLevel, compressionParams,
diff --git a/programs/benchzstd.h b/programs/benchzstd.h
index f14a68192..cdb6101c2 100644
--- a/programs/benchzstd.h
+++ b/programs/benchzstd.h
@@ -126,11 +126,12 @@ int BMK_benchFilesAdvanced(
 
 /*! BMK_syntheticTest() -- called from zstdcli */
 /*  Generates a sample with datagen, using compressibility argument */
-/*  cLevel - compression level to benchmark, errors if invalid
- *  compressibility - determines compressibility of sample
- *  compressionParams - basic compression Parameters
- *  displayLevel - see benchFiles
- *  adv - see advanced_Params_t
+/* @cLevel - compression level to benchmark, errors if invalid
+ * @compressibility - determines compressibility of sample, range [0.0 - 1.0]
+ *        if @compressibility < 0.0, uses the lorem ipsum generator
+ * @compressionParams - basic compression Parameters
+ * @displayLevel - see benchFiles
+ * @adv - see advanced_Params_t
  * @return: 0 on success, !0 on error
  */
 int BMK_syntheticTest(int cLevel, double compressibility,
diff --git a/programs/lorem.c b/programs/lorem.c
new file mode 100644
index 000000000..59dd6da62
--- /dev/null
+++ b/programs/lorem.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* Implementation notes:
+ *
+ * This is a very simple lorem ipsum generator
+ * which features a static list of words
+ * and prints them one after another randomly
+ * with a fake sentence / paragraph structure.
+ *
+ * The goal is to generate a printable text
+ * that can be used to fake a text compression scenario.
+ * The resulting compression / ratio curve of the lorem ipsum generator
+ * is more satisfying than the previous statistical generator,
+ * which was initially designed for entropy compression,
+ * and lacks a regularity more representative of text.
+ *
+ * The compression ratio achievable on the generated lorem ipsum
+ * is still a bit too good, presumably because the dictionary is too small.
+ * It would be possible to create some more complex scheme,
+ * notably by enlarging the dictionary with a word generator,
+ * and adding grammatical rules (composition) and syntax rules.
+ * But that's probably overkill for the intended goal.
+ */
+
+#include "lorem.h"
+#include <string.h> /* memcpy */
+#include <limits.h> /* INT_MAX */
+#include <assert.h>
+
+#define WORD_MAX_SIZE 20
+
+/* Define the word pool */
+static const char *words[] = {
+    "lorem",      "ipsum",      "dolor",      "sit",          "amet",
+    "consectetur", "adipiscing", "elit",      "sed",          "do",
+    "eiusmod",    "tempor",     "incididunt", "ut",           "labore",
+    "et",         "dolore",     "magna",      "aliqua",       "dis",
+    "lectus",     "vestibulum", "mattis",     "ullamcorper",  "velit",
+    "commodo",    "a",          "lacus",      "arcu",         "magnis",
+    "parturient", "montes",     "nascetur",   "ridiculus",    "mus",
+    "mauris",     "nulla",      "malesuada",  "pellentesque", "eget",
+    "gravida",    "in",         "dictum",     "non",          "erat",
+    "nam",        "voluptat",   "maecenas",   "blandit",      "aliquam",
+    "etiam",      "enim",       "lobortis",   "scelerisque",  "fermentum",
+    "dui",        "faucibus",   "ornare",     "at",           "elementum",
+    "eu",         "facilisis",  "odio",       "morbi",        "quis",
+    "eros",       "donec",      "ac",         "orci",         "purus",
+    "turpis",     "cursus",     "leo",        "vel",          "porta"};
+
+/* simple distribution that favors small words :
+ * 1 letter : weight 3
+ * 2-3 letters : weight 2
+ * 4+ letters : weight 1
+ * This is expected to be a bit more difficult to compress */
+static const int distrib[] = {
+    0, 1, 2, 3, 3, 4, 5, 6, 7, 8,
+    8,9, 9, 10, 11, 12, 13, 13, 14, 15,
+    15, 16, 17, 18, 19, 19, 20, 21, 22, 23,
+    24, 25, 26, 26, 26, 27, 28, 29, 30, 31,
+    32, 33, 34, 34, 35, 36, 37, 38, 39, 40,
+    41, 41, 42, 43, 43, 44, 45, 45, 46, 47,
+    48, 49, 50, 51, 52, 53, 54, 55, 55, 56,
+    57, 58, 58, 59, 60, 60, 61, 62, 63, 64,
+    65, 66, 67, 67, 68, 69, 70, 71, 72, 72,
+    73, 73, 74 };
+static const unsigned distribCount = sizeof(distrib) / sizeof(distrib[0]);
+
+/* Note: this unit only works when invoked sequentially.
+ * No concurrent access is allowed */
+static char *g_ptr = NULL;
+static size_t g_nbChars = 0;
+static size_t g_maxChars = 10000000;
+static unsigned g_randRoot = 0;
+
+#define RDG_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
+/* Advance g_randRoot and map it to [0, range) (written for a 32-bit unsigned) */
+static unsigned LOREM_rand(unsigned range) {
+  static const unsigned prime1 = 2654435761U;
+  static const unsigned prime2 = 2246822519U;
+  unsigned rand32 = g_randRoot;
+  rand32 *= prime1;
+  rand32 ^= prime2;
+  rand32 = RDG_rotl32(rand32, 13);
+  g_randRoot = rand32;
+  return (unsigned)(((unsigned long long)rand32 * range) >> 32);
+}
+
+/* Fill whatever space is left in the buffer: '.', then spaces, then '\n' */
+static void writeLastCharacters(void) {
+  size_t lastChars = g_maxChars - g_nbChars;
+  assert(g_maxChars >= g_nbChars);
+  if (lastChars == 0)
+    return;
+  g_ptr[g_nbChars++] = '.';
+  if (lastChars > 2) {
+    memset(g_ptr + g_nbChars, ' ', lastChars - 2);
+  }
+  if (lastChars > 1) {
+    g_ptr[g_maxChars-1] = '\n';
+  }
+  g_nbChars = g_maxChars;
+}
+
+/* Append @word then @separator; if @upCase, shift the first letter to upper case.
+ * When both do not fit in the remaining space, the buffer is finished instead. */
+static void generateWord(const char *word, const char *separator, int upCase)
+{
+    size_t const len = strlen(word) + strlen(separator);
+    if (g_nbChars + len > g_maxChars) {
+        writeLastCharacters();
+        return;
+    }
+    memcpy(g_ptr + g_nbChars, word, strlen(word));
+    if (upCase) {
+        static const char toUp = 'A' - 'a';
+        g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp);
+    }
+    g_nbChars += strlen(word);
+    memcpy(g_ptr + g_nbChars, separator, strlen(separator));
+    g_nbChars += strlen(separator);
+}
+
+/* Sum of two LOREM_rand(target) draws, plus 1: a value in [1, 2*target - 1] */
+static int about(unsigned target) {
+  return (int)(LOREM_rand(target) + LOREM_rand(target) + 1);
+}
+
+/* Function to generate a random sentence */
+static void generateSentence(int nbWords) {
+  int commaPos = about(9);
+  int comma2 = commaPos + about(7);
+  int i;
+  for (i = 0; i < nbWords; i++) {
+    int const wordID = distrib[LOREM_rand(distribCount)];
+    const char *const word = words[wordID];
+    const char* sep = " ";
+    if (i == commaPos)
+      sep = ", ";
+    if (i == comma2)
+      sep = ", ";
+    if (i == nbWords - 1)
+      sep = ". ";
+    generateWord(word, sep, i==0);
+  }
+}
+
+/* Generate @nbSentences random sentences, then up to 2 newlines if space remains */
+static void generateParagraph(int nbSentences) {
+  int i;
+  for (i = 0; i < nbSentences; i++) {
+    int wordsPerSentence = about(8);
+    generateSentence(wordsPerSentence);
+  }
+  if (g_nbChars < g_maxChars) {
+    g_ptr[g_nbChars++] = '\n';
+  }
+  if (g_nbChars < g_maxChars) {
+    g_ptr[g_nbChars++] = '\n';
+  }
+}
+
+/* It's "common" for lorem ipsum generators to start with the same first
+ * pre-defined sentence */
+static void generateFirstSentence(void) {
+  int i;
+  for (i = 0; i < 18; i++) {
+    const char *word = words[i];
+    const char *separator = " ";
+    if (i == 4)
+      separator = ", ";
+    if (i == 7)
+      separator = ", ";
+    generateWord(word, separator, i==0);
+  }
+  generateWord(words[18], ". ", 0);
+}
+
+size_t LOREM_genBlock(void* buffer, size_t size,
+                      unsigned seed,
+                      int first, int fill)
+{
+  g_ptr = (char*)buffer;
+  assert(size < INT_MAX);
+  g_maxChars = size;
+  g_nbChars = 0;
+  g_randRoot = seed;
+  if (first) {
+    generateFirstSentence();
+  }
+  while (g_nbChars < g_maxChars) {
+    int sentencePerParagraph = about(7);
+    generateParagraph(sentencePerParagraph);
+    if (!fill)
+      break; /* only generate one paragraph in not-fill mode */
+  }
+  g_ptr = NULL;
+  return g_nbChars;
+}
+
+void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
+{
+  LOREM_genBlock(buffer, size, seed, 1, 1);
+}
+
diff --git a/programs/lorem.h b/programs/lorem.h
new file mode 100644
index 000000000..4a87f8748
--- /dev/null
+++ b/programs/lorem.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* lorem ipsum generator */
+
+#ifndef LOREM_H
+#define LOREM_H
+
+#include <stddef.h> /* size_t */
+
+/*
+ * LOREM_genBuffer():
+ * Generate @size bytes of compressible data using lorem ipsum generator
+ * into provided @buffer.
+ */
+void LOREM_genBuffer(void* buffer, size_t size, unsigned seed);
+
+/*
+ * LOREM_genBlock():
+ * Similar to LOREM_genBuffer, with additional controls :
+ * - @first : generate the first sentence
+ * - @fill : fill the entire @buffer,
+ *           if ==0: generate one paragraph at most.
+ * @return : nb of bytes generated into @buffer.
+ */
+size_t LOREM_genBlock(void* buffer, size_t size,
+                      unsigned seed,
+                      int first, int fill);
+
+#endif /* LOREM_H */
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index 3f0ae8bdd..dd21021b0 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -856,7 +856,7 @@ int main(int argCount, const char* argv[])
     ZSTD_paramSwitch_e useRowMatchFinder = ZSTD_ps_auto;
     FIO_compressionType_t cType = FIO_zstdCompression;
     unsigned nbWorkers = 0;
-    double compressibility = 0.5;
+    double compressibility = -1.0; /* lorem ipsum generator */
     unsigned bench_nbSeconds = 3; /* would be better if this value was synchronized from bench */
     size_t blockSize = 0;
 
@@ -1280,7 +1280,7 @@ int main(int argCount, const char* argv[])
                     break;
 
                     /* unknown command */
-                default : 
+                default :
                     sprintf(shortArgument, "-%c", argument[0]);
                     badUsage(programName, shortArgument);
                     CLEAN_RETURN(1);