diff --git a/.travis.yml b/.travis.yml
index 36df67052..a61b82876 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,23 +20,25 @@ matrix:
     - env: Cmd='make gcc8install && CC=gcc-8 CFLAGS="-Werror -O3" make -j all'
     - env: Cmd='make clang38install && CC=clang-3.8 make clean msan-test-zstd'
 
+    - env: Cmd='make staticAnalyze'
+
     - env: Cmd='make gcc6install && CC=gcc-6 make clean uasan-fuzztest'
     - env: Cmd='make gcc6install libc6install
              && make clean && CC=gcc-6 CFLAGS=-m32 make uasan-fuzztest'
     - env: Cmd='make clang38install && CC=clang-3.8 make clean msan-fuzztest'
     - env: Cmd='make clang38install && CC=clang-3.8 make clean tsan-test-zstream'
 
-    - env: Cmd='make arminstall && make armfuzz'
-    - env: Cmd='make arminstall && make aarch64fuzz'
-    - env: Cmd='make ppcinstall && make ppcfuzz'
-    - env: Cmd='make ppcinstall && make ppc64fuzz'
-
     - env: Cmd='make -j uasanregressiontest
              && make clean && make -j msanregressiontest'
 
     - env: Cmd='make valgrindinstall && make -C tests clean valgrindTest
              && make clean && make -C tests test-fuzzer-stackmode'
 
+    - env: Cmd='make arminstall && make armfuzz'
+    - env: Cmd='make arminstall && make aarch64fuzz'
+    - env: Cmd='make ppcinstall && make ppcfuzz'
+    - env: Cmd='make ppcinstall && make ppc64fuzz'
+
     - env: Cmd='make lz4install && make -C tests test-lz4
              && make clean && make -C tests test-pool
              && make clean && bash tests/libzstd_partial_builds.sh'
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 000000000..0f7ad8bfc
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,5 @@
+# Code of Conduct
+
+Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
+Please read the [full text](https://code.fb.com/codeofconduct/)
+so that you can understand what actions will and will not be tolerated.
diff --git a/Makefile b/Makefile
index 6c4ab9c9b..a17900450 100644
--- a/Makefile
+++ b/Makefile
@@ -23,6 +23,7 @@ else
 EXT =
 endif
 
+## default: Build lib-release and zstd-release
 .PHONY: default
 default: lib-release zstd-release
 
@@ -30,10 +31,9 @@ default: lib-release zstd-release
 all: allmost examples manual contrib
 
 .PHONY: allmost
-allmost: allzstd
-	$(MAKE) -C $(ZWRAPDIR) all
+allmost: allzstd zlibwrapper
 
-#skip zwrapper, can't build that on alternate architectures without the proper zlib installed
+# skip zwrapper, can't build that on alternate architectures without the proper zlib installed
 .PHONY: allzstd
 allzstd: lib
 	$(MAKE) -C $(PRGDIR) all
@@ -44,8 +44,8 @@ all32:
 	$(MAKE) -C $(PRGDIR) zstd32
 	$(MAKE) -C $(TESTDIR) all32
 
-.PHONY: lib lib-release
-lib lib-release:
+.PHONY: lib lib-release libzstd.a
+lib lib-release :
 	@$(MAKE) -C $(ZSTDDIR) $@
 
 .PHONY: zstd zstd-release
@@ -59,8 +59,8 @@ zstdmt:
 	cp $(PRGDIR)/zstd$(EXT) ./zstdmt$(EXT)
 
 .PHONY: zlibwrapper
-zlibwrapper:
-	$(MAKE) -C $(ZWRAPDIR) test
+zlibwrapper: lib
+	$(MAKE) -C $(ZWRAPDIR) all
 
 .PHONY: test
 test: MOREFLAGS += -g -DDEBUGLEVEL=1 -Werror
@@ -88,6 +88,7 @@ contrib: lib
 	$(MAKE) -C contrib/pzstd all
 	$(MAKE) -C contrib/seekable_format/examples all
 	$(MAKE) -C contrib/adaptive-compression all
+	$(MAKE) -C contrib/largeNbDicts all
 
 .PHONY: cleanTabs
 cleanTabs:
@@ -104,6 +105,7 @@ clean:
 	@$(MAKE) -C contrib/pzstd $@ > $(VOID)
 	@$(MAKE) -C contrib/seekable_format/examples $@ > $(VOID)
 	@$(MAKE) -C contrib/adaptive-compression $@ > $(VOID)
+	@$(MAKE) -C contrib/largeNbDicts $@ > $(VOID)
 	@$(RM) zstd$(EXT) zstdmt$(EXT) tmp*
 	@$(RM) -r lz4
 	@echo Cleaning completed
@@ -114,11 +116,26 @@ clean:
 ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD DragonFly NetBSD MSYS_NT))
 
 HOST_OS = POSIX
-CMAKE_PARAMS = -DZSTD_BUILD_CONTRIB:BOOL=ON -DZSTD_BUILD_STATIC:BOOL=ON -DZSTD_BUILD_TESTS:BOOL=ON -DZSTD_ZLIB_SUPPORT:BOOL=ON -DZSTD_LZMA_SUPPORT:BOOL=ON -DCMAKE_BUILD_TYPE=Release 
+CMAKE_PARAMS = -DZSTD_BUILD_CONTRIB:BOOL=ON -DZSTD_BUILD_STATIC:BOOL=ON -DZSTD_BUILD_TESTS:BOOL=ON -DZSTD_ZLIB_SUPPORT:BOOL=ON -DZSTD_LZMA_SUPPORT:BOOL=ON -DCMAKE_BUILD_TYPE=Release
 
+# Print a two column output of targets and their description. To add a target description, put a
+# comment in the Makefile with the format "## <TARGET>: <DESCRIPTION>".  For example:
+#
+## list: Print all targets and their descriptions (if provided)
 .PHONY: list
 list:
-	@$(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null | awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' | sort | egrep -v -e '^[^[:alnum:]]' -e '^$@$$' | xargs
+	@TARGETS=$$($(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null \
+		| awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' \
+		| egrep -v  -e '^[^[:alnum:]]' | sort); \
+	{ \
+	    printf "Target Name\tDescription\n"; \
+	    printf "%0.s-" {1..16}; printf "\t"; printf "%0.s-" {1..40}; printf "\n"; \
+	    for target in $$TARGETS; do \
+	        line=$$(egrep "^##[[:space:]]+$$target:" $(lastword $(MAKEFILE_LIST))); \
+	        description=$$(echo $$line | awk '{i=index($$0,":"); print substr($$0,i+1)}' | xargs); \
+	        printf "$$target\t$$description\n"; \
+	    done \
+	} | column -t -s $$'\t'
 
 .PHONY: install clangtest armtest usan asan uasan
 install:
@@ -198,7 +215,7 @@ gcc6test: clean
 
 clangtest: clean
 	clang -v
-	$(MAKE) all CXX=clang-++ CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion -Wdocumentation"
+	$(MAKE) all CXX=clang++ CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion -Wdocumentation"
 
 armtest: clean
 	$(MAKE) -C $(TESTDIR) datagen   # use native, faster
@@ -351,7 +368,10 @@ bmi32build: clean
 	$(CC) -v
 	CFLAGS="-O3 -mbmi -m32 -Werror" $(MAKE) -C $(TESTDIR) test
 
-staticAnalyze: clean
+# static analyzer test uses clang's scan-build
+# does not analyze zlibWrapper, due to detected issues in zlib source code
+staticAnalyze: SCANBUILD ?= scan-build
+staticAnalyze:
 	$(CC) -v
-	CPPFLAGS=-g scan-build --status-bugs -v $(MAKE) all
+	CC=$(CC) CPPFLAGS=-g $(SCANBUILD) --status-bugs -v $(MAKE) allzstd examples contrib
 endif
diff --git a/NEWS b/NEWS
index e3bfb242d..1553f5ad8 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,8 @@
+v1.3.6
+perf: much faster dictionary builder, by @jenniferliu
+api : reduced DDict size by 2 KB
+misc: tests/paramgrill, a parameter optimizer, by @GeorgeLu97
+
 v1.3.5
 perf: much faster dictionary compression, by @felixhandte
 perf: small quality improvement for dictionary generation, by @terrelln
diff --git a/build/VS2008/fuzzer/fuzzer.vcproj b/build/VS2008/fuzzer/fuzzer.vcproj
index f6ea1f855..4d444caef 100644
--- a/build/VS2008/fuzzer/fuzzer.vcproj
+++ b/build/VS2008/fuzzer/fuzzer.vcproj
@@ -336,6 +336,10 @@
 				RelativePath="..\..\..\lib\dictBuilder\cover.c"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\dictBuilder\fastcover.c"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
 				>
@@ -482,6 +486,10 @@
 				RelativePath="..\..\..\lib\dictBuilder\zdict.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\dictBuilder\cover.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\dictBuilder\zdict_static.h"
 				>
diff --git a/build/VS2008/zstd/zstd.vcproj b/build/VS2008/zstd/zstd.vcproj
index 5d9f68327..46d7c85dd 100644
--- a/build/VS2008/zstd/zstd.vcproj
+++ b/build/VS2008/zstd/zstd.vcproj
@@ -348,6 +348,10 @@
 				RelativePath="..\..\..\lib\dictBuilder\cover.c"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\dictBuilder\fastcover.c"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
 				>
@@ -522,6 +526,10 @@
 				RelativePath="..\..\..\lib\dictBuilder\zdict.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\dictBuilder\cover.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\dictBuilder\zdict_static.h"
 				>
diff --git a/build/VS2008/zstdlib/zstdlib.vcproj b/build/VS2008/zstdlib/zstdlib.vcproj
index 7234b02ac..950a6f39c 100644
--- a/build/VS2008/zstdlib/zstdlib.vcproj
+++ b/build/VS2008/zstdlib/zstdlib.vcproj
@@ -332,6 +332,10 @@
 				RelativePath="..\..\..\lib\dictBuilder\cover.c"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\dictBuilder\fastcover.c"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\dictBuilder\divsufsort.c"
 				>
@@ -502,6 +506,10 @@
 				RelativePath="..\..\..\lib\dictBuilder\zdict.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\dictBuilder\cover.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\dictBuilder\zdict_static.h"
 				>
diff --git a/build/VS2010/fuzzer/fuzzer.vcxproj b/build/VS2010/fuzzer/fuzzer.vcxproj
index f0d1ab066..6077cd2c1 100644
--- a/build/VS2010/fuzzer/fuzzer.vcxproj
+++ b/build/VS2010/fuzzer/fuzzer.vcxproj
@@ -176,6 +176,7 @@
     <ClCompile Include="..\..\..\lib\decompress\huf_decompress.c" />
     <ClCompile Include="..\..\..\lib\decompress\zstd_decompress.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
+    <ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
     <ClCompile Include="..\..\..\programs\datagen.c" />
@@ -199,6 +200,7 @@
     <ClInclude Include="..\..\..\lib\compress\zstdmt_compress.h" />
     <ClInclude Include="..\..\..\lib\dictBuilder\divsufsort.h" />
     <ClInclude Include="..\..\..\lib\dictBuilder\zdict.h" />
+    <ClInclude Include="..\..\..\lib\dictBuilder\cover.h" />
     <ClInclude Include="..\..\..\lib\legacy\zstd_legacy.h" />
     <ClInclude Include="..\..\..\programs\datagen.h" />
     <ClInclude Include="..\..\..\programs\util.h" />
diff --git a/build/VS2010/libzstd-dll/libzstd-dll.vcxproj b/build/VS2010/libzstd-dll/libzstd-dll.vcxproj
index 92d518d3d..6e14e020e 100644
--- a/build/VS2010/libzstd-dll/libzstd-dll.vcxproj
+++ b/build/VS2010/libzstd-dll/libzstd-dll.vcxproj
@@ -43,6 +43,7 @@
     <ClCompile Include="..\..\..\lib\deprecated\zbuff_compress.c" />
     <ClCompile Include="..\..\..\lib\deprecated\zbuff_decompress.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
+    <ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
     <ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />
diff --git a/build/VS2010/libzstd/libzstd.vcxproj b/build/VS2010/libzstd/libzstd.vcxproj
index c306fcec9..18f5cb533 100644
--- a/build/VS2010/libzstd/libzstd.vcxproj
+++ b/build/VS2010/libzstd/libzstd.vcxproj
@@ -43,6 +43,7 @@
     <ClCompile Include="..\..\..\lib\deprecated\zbuff_compress.c" />
     <ClCompile Include="..\..\..\lib\deprecated\zbuff_decompress.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
+    <ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
     <ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />
diff --git a/build/VS2010/zstd/zstd.vcxproj b/build/VS2010/zstd/zstd.vcxproj
index 4af281326..936960b58 100644
--- a/build/VS2010/zstd/zstd.vcxproj
+++ b/build/VS2010/zstd/zstd.vcxproj
@@ -40,6 +40,7 @@
     <ClCompile Include="..\..\..\lib\decompress\huf_decompress.c" />
     <ClCompile Include="..\..\..\lib\decompress\zstd_decompress.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\cover.c" />
+    <ClCompile Include="..\..\..\lib\dictBuilder\fastcover.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\divsufsort.c" />
     <ClCompile Include="..\..\..\lib\dictBuilder\zdict.c" />
     <ClCompile Include="..\..\..\lib\legacy\zstd_v01.c" />
@@ -61,6 +62,7 @@
     <ClInclude Include="..\..\..\lib\common\xxhash.h" />
     <ClInclude Include="..\..\..\lib\compress\zstdmt_compress.h" />
     <ClInclude Include="..\..\..\lib\dictBuilder\zdict.h" />
+    <ClInclude Include="..\..\..\lib\dictBuilder\cover.h" />
     <ClInclude Include="..\..\..\lib\dictBuilder\divsufsort.h" />
     <ClInclude Include="..\..\..\lib\common\fse.h" />
     <ClInclude Include="..\..\..\lib\common\huf.h" />
diff --git a/build/cmake/contrib/gen_html/CMakeLists.txt b/build/cmake/contrib/gen_html/CMakeLists.txt
index 958e9d173..c4062d475 100644
--- a/build/cmake/contrib/gen_html/CMakeLists.txt
+++ b/build/cmake/contrib/gen_html/CMakeLists.txt
@@ -27,4 +27,4 @@ ADD_CUSTOM_TARGET(zstd_manual.html ALL
                   ${GENHTML_BINARY} "${LIBVERSION}" "${LIBRARY_DIR}/zstd.h" "${PROJECT_BINARY_DIR}/zstd_manual.html"
                   DEPENDS gen_html COMMENT "Update zstd manual")
 
-INSTALL(FILES "${PROJECT_BINARY_DIR}/zstd_manual.html" DESTINATION "${CMAKE_INSTALL_PREFIX}/${DOC_INSTALL_DIR}")
+INSTALL(FILES "${PROJECT_BINARY_DIR}/zstd_manual.html" DESTINATION "${CMAKE_INSTALL_DOCDIR}")
diff --git a/build/cmake/lib/CMakeLists.txt b/build/cmake/lib/CMakeLists.txt
index e84e06301..ffc196ddc 100644
--- a/build/cmake/lib/CMakeLists.txt
+++ b/build/cmake/lib/CMakeLists.txt
@@ -47,6 +47,7 @@ SET(Sources
         ${LIBRARY_DIR}/decompress/huf_decompress.c
         ${LIBRARY_DIR}/decompress/zstd_decompress.c
         ${LIBRARY_DIR}/dictBuilder/cover.c
+        ${LIBRARY_DIR}/dictBuilder/fastcover.c
         ${LIBRARY_DIR}/dictBuilder/divsufsort.c
         ${LIBRARY_DIR}/dictBuilder/zdict.c
         ${LIBRARY_DIR}/deprecated/zbuff_common.c
@@ -74,6 +75,7 @@ SET(Headers
         ${LIBRARY_DIR}/compress/zstd_ldm.h
         ${LIBRARY_DIR}/compress/zstdmt_compress.h
         ${LIBRARY_DIR}/dictBuilder/zdict.h
+        ${LIBRARY_DIR}/dictBuilder/cover.h
         ${LIBRARY_DIR}/deprecated/zbuff.h)
 
 IF (ZSTD_LEGACY_SUPPORT)
@@ -178,6 +180,7 @@ INSTALL(FILES
     ${LIBRARY_DIR}/zstd.h
     ${LIBRARY_DIR}/deprecated/zbuff.h
     ${LIBRARY_DIR}/dictBuilder/zdict.h
+    ${LIBRARY_DIR}/dictBuilder/cover.h
     ${LIBRARY_DIR}/common/zstd_errors.h
     DESTINATION "include")
 
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile
index 681494888..72ce04f2a 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile
@@ -2,10 +2,9 @@ ARG :=
 
 CC ?= gcc
 CFLAGS ?= -O3
-INCLUDES := -I ../randomDictBuilder -I ../fastCover -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
+INCLUDES := -I ../randomDictBuilder -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
 
 RANDOM_FILE := ../randomDictBuilder/random.c
-FAST_FILE := ../fastCover/fastCover.c
 IO_FILE := ../randomDictBuilder/io.c
 
 all: run clean
@@ -22,8 +21,8 @@ test: benchmarkTest clean
 benchmarkTest: benchmark test.sh
 	sh test.sh
 
-benchmark: benchmark.o io.o random.o fastCover.o libzstd.a
-	$(CC) $(CFLAGS) benchmark.o io.o random.o fastCover.o libzstd.a -o benchmark
+benchmark: benchmark.o io.o random.o libzstd.a
+	$(CC) $(CFLAGS) benchmark.o io.o random.o libzstd.a -o benchmark
 
 benchmark.o: benchmark.c
 	$(CC) $(CFLAGS) $(INCLUDES) -c benchmark.c
@@ -31,9 +30,6 @@ benchmark.o: benchmark.c
 random.o: $(RANDOM_FILE)
 	$(CC) $(CFLAGS) $(INCLUDES) -c $(RANDOM_FILE)
 
-fastCover.o: $(FAST_FILE)
-	$(CC) $(CFLAGS) $(INCLUDES) -c $(FAST_FILE)
-
 io.o: $(IO_FILE)
 	$(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE)
 
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
index 559776e2b..6a6c7f1d2 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
@@ -14,113 +14,836 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
 
 ###Benchmarking Result:
 - First Cover is optimize cover, second Cover uses optimized d and k from first one.
-- For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one.
+- For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one. This is run for accel values from 1 to 10.
 - Fourth column is chosen d and fifth column is chosen k
 
 github:
-NODICT       0.000025       2.999642        
-RANDOM       0.030101       8.791189        
-LEGACY       0.913108       8.173529        
-COVER       59.234160       10.652243        8          1298
-COVER       6.258459       10.652243        8          1298
-FAST15       9.959246       10.555630        8          1874
-FAST15       0.077719       10.555630        8          1874
-FAST16       10.028343       10.701698        8          1106
-FAST16       0.078117       10.701698        8          1106
-FAST17       10.567355       10.650652        8          1106
-FAST17       0.124833       10.650652        8          1106
-FAST18       11.795287       10.499142        8          1826
-FAST18       0.086992       10.499142        8          1826
-FAST19       13.132451       10.527140        8          1826
-FAST19       0.134716       10.527140        8          1826
-FAST20       14.366314       10.494710        8          1826
-FAST20       0.128844       10.494710        8          1826
-FAST21       14.941238       10.503488        8          1778
-FAST21       0.134975       10.503488        8          1778
-FAST22       15.146226       10.509284        8          1826
-FAST22       0.146918       10.509284        8          1826
-FAST23       16.260552       10.509284        8          1826
-FAST23       0.158494       10.509284        8          1826
-FAST24       16.806037       10.512369        8          1826
-FAST24       0.190464       10.512369        8          1826
+NODICT       0.000004       2.999642        
+RANDOM       0.024560       8.791189        
+LEGACY       0.727109       8.173529        
+COVER       40.565676       10.652243        8          1298
+COVER       3.608284       10.652243        8          1298
+FAST f=15 a=1       4.181024       10.570882        8          1154
+FAST f=15 a=1       0.040788       10.570882        8          1154
+FAST f=15 a=2       3.548352       10.574287        6          1970
+FAST f=15 a=2       0.035535       10.574287        6          1970
+FAST f=15 a=3       3.287364       10.613950        6          1010
+FAST f=15 a=3       0.032182       10.613950        6          1010
+FAST f=15 a=4       3.184976       10.573883        6          1058
+FAST f=15 a=4       0.029878       10.573883        6          1058
+FAST f=15 a=5       3.045513       10.580640        8          1154
+FAST f=15 a=5       0.022162       10.580640        8          1154
+FAST f=15 a=6       3.003296       10.583677        6          1010
+FAST f=15 a=6       0.028091       10.583677        6          1010
+FAST f=15 a=7       2.952655       10.622551        6          1106
+FAST f=15 a=7       0.02724       10.622551        6          1106
+FAST f=15 a=8       2.945674       10.614657        6          1010
+FAST f=15 a=8       0.027264       10.614657        6          1010
+FAST f=15 a=9       3.153439       10.564018        8          1154
+FAST f=15 a=9       0.020635       10.564018        8          1154
+FAST f=15 a=10       2.950416       10.511454        6          1010
+FAST f=15 a=10       0.026606       10.511454        6          1010
+FAST f=16 a=1       3.970029       10.681035        8          1154
+FAST f=16 a=1       0.038188       10.681035        8          1154
+FAST f=16 a=2       3.422892       10.484978        6          1874
+FAST f=16 a=2       0.034702       10.484978        6          1874
+FAST f=16 a=3       3.215836       10.632631        8          1154
+FAST f=16 a=3       0.026084       10.632631        8          1154
+FAST f=16 a=4       3.081353       10.626533        6          1106
+FAST f=16 a=4       0.030032       10.626533        6          1106
+FAST f=16 a=5       3.041241       10.545027        8          1922
+FAST f=16 a=5       0.022882       10.545027        8          1922
+FAST f=16 a=6       2.989390       10.638284        6          1874
+FAST f=16 a=6       0.028308       10.638284        6          1874
+FAST f=16 a=7       3.001581       10.797136        6          1106
+FAST f=16 a=7       0.027479       10.797136        6          1106
+FAST f=16 a=8       2.984107       10.658356        8          1058
+FAST f=16 a=8       0.021099       10.658356        8          1058
+FAST f=16 a=9       2.925788       10.523869        6          1010
+FAST f=16 a=9       0.026905       10.523869        6          1010
+FAST f=16 a=10       2.889605       10.745841        6          1874
+FAST f=16 a=10       0.026846       10.745841        6          1874
+FAST f=17 a=1       4.031953       10.672080        8          1202
+FAST f=17 a=1       0.040658       10.672080        8          1202
+FAST f=17 a=2       3.458107       10.589352        8          1106
+FAST f=17 a=2       0.02926       10.589352        8          1106
+FAST f=17 a=3       3.291189       10.662714        8          1154
+FAST f=17 a=3       0.026531       10.662714        8          1154
+FAST f=17 a=4       3.154950       10.549456        8          1346
+FAST f=17 a=4       0.024991       10.549456        8          1346
+FAST f=17 a=5       3.092271       10.541670        6          1202
+FAST f=17 a=5       0.038285       10.541670        6          1202
+FAST f=17 a=6       3.166146       10.729112        6          1874
+FAST f=17 a=6       0.038217       10.729112        6          1874
+FAST f=17 a=7       3.035467       10.810485        6          1106
+FAST f=17 a=7       0.036655       10.810485        6          1106
+FAST f=17 a=8       3.035668       10.530532        6          1058
+FAST f=17 a=8       0.037715       10.530532        6          1058
+FAST f=17 a=9       2.987917       10.589802        8          1922
+FAST f=17 a=9       0.02217       10.589802        8          1922
+FAST f=17 a=10       2.981647       10.722579        8          1106
+FAST f=17 a=10       0.021948       10.722579        8          1106
+FAST f=18 a=1       4.067144       10.634943        8          1154
+FAST f=18 a=1       0.041386       10.634943        8          1154
+FAST f=18 a=2       3.507377       10.546230        6          1970
+FAST f=18 a=2       0.037572       10.546230        6          1970
+FAST f=18 a=3       3.323015       10.648061        8          1154
+FAST f=18 a=3       0.028306       10.648061        8          1154
+FAST f=18 a=4       3.216735       10.705402        6          1010
+FAST f=18 a=4       0.030755       10.705402        6          1010
+FAST f=18 a=5       3.175794       10.588154        8          1874
+FAST f=18 a=5       0.025315       10.588154        8          1874
+FAST f=18 a=6       3.127459       10.751104        8          1106
+FAST f=18 a=6       0.023897       10.751104        8          1106
+FAST f=18 a=7       3.083017       10.780402        6          1106
+FAST f=18 a=7       0.029158       10.780402        6          1106
+FAST f=18 a=8       3.069700       10.547226        8          1346
+FAST f=18 a=8       0.024046       10.547226        8          1346
+FAST f=18 a=9       3.056591       10.674759        6          1010
+FAST f=18 a=9       0.028496       10.674759        6          1010
+FAST f=18 a=10       3.063588       10.737578        8          1106
+FAST f=18 a=10       0.023033       10.737578        8          1106
+FAST f=19 a=1       4.164041       10.650333        8          1154
+FAST f=19 a=1       0.042906       10.650333        8          1154
+FAST f=19 a=2       3.585409       10.577066        6          1058
+FAST f=19 a=2       0.038994       10.577066        6          1058
+FAST f=19 a=3       3.439643       10.639403        8          1154
+FAST f=19 a=3       0.028427       10.639403        8          1154
+FAST f=19 a=4       3.268869       10.554410        8          1298
+FAST f=19 a=4       0.026866       10.554410        8          1298
+FAST f=19 a=5       3.238225       10.615109        6          1010
+FAST f=19 a=5       0.03078       10.615109        6          1010
+FAST f=19 a=6       3.199558       10.609782        6          1874
+FAST f=19 a=6       0.030099       10.609782        6          1874
+FAST f=19 a=7       3.132395       10.794753        6          1106
+FAST f=19 a=7       0.028964       10.794753        6          1106
+FAST f=19 a=8       3.148446       10.554842        8          1298
+FAST f=19 a=8       0.024277       10.554842        8          1298
+FAST f=19 a=9       3.108324       10.668763        6          1010
+FAST f=19 a=9       0.02896       10.668763        6          1010
+FAST f=19 a=10       3.159863       10.757347        8          1106
+FAST f=19 a=10       0.023351       10.757347        8          1106
+FAST f=20 a=1       4.462698       10.661788        8          1154
+FAST f=20 a=1       0.047174       10.661788        8          1154
+FAST f=20 a=2       3.820269       10.678612        6          1106
+FAST f=20 a=2       0.040807       10.678612        6          1106
+FAST f=20 a=3       3.644955       10.648424        8          1154
+FAST f=20 a=3       0.031398       10.648424        8          1154
+FAST f=20 a=4       3.546257       10.559756        8          1298
+FAST f=20 a=4       0.029856       10.559756        8          1298
+FAST f=20 a=5       3.485248       10.646637        6          1010
+FAST f=20 a=5       0.033756       10.646637        6          1010
+FAST f=20 a=6       3.490438       10.775824        8          1106
+FAST f=20 a=6       0.028338       10.775824        8          1106
+FAST f=20 a=7       3.631289       10.801795        6          1106
+FAST f=20 a=7       0.035228       10.801795        6          1106
+FAST f=20 a=8       3.758936       10.545116        8          1346
+FAST f=20 a=8       0.027495       10.545116        8          1346
+FAST f=20 a=9       3.707024       10.677454        6          1010
+FAST f=20 a=9       0.031326       10.677454        6          1010
+FAST f=20 a=10       3.586593       10.756017        8          1106
+FAST f=20 a=10       0.027122       10.756017        8          1106
+FAST f=21 a=1       5.701396       10.655398        8          1154
+FAST f=21 a=1       0.067744       10.655398        8          1154
+FAST f=21 a=2       5.270542       10.650743        6          1106
+FAST f=21 a=2       0.052999       10.650743        6          1106
+FAST f=21 a=3       4.945294       10.652380        8          1154
+FAST f=21 a=3       0.052678       10.652380        8          1154
+FAST f=21 a=4       4.894079       10.543185        8          1298
+FAST f=21 a=4       0.04997       10.543185        8          1298
+FAST f=21 a=5       4.785417       10.630321        6          1010
+FAST f=21 a=5       0.045294       10.630321        6          1010
+FAST f=21 a=6       4.789381       10.664477        6          1874
+FAST f=21 a=6       0.046578       10.664477        6          1874
+FAST f=21 a=7       4.302955       10.805179        6          1106
+FAST f=21 a=7       0.041205       10.805179        6          1106
+FAST f=21 a=8       4.034630       10.551211        8          1298
+FAST f=21 a=8       0.040121       10.551211        8          1298
+FAST f=21 a=9       4.523868       10.799114        6          1010
+FAST f=21 a=9       0.043592       10.799114        6          1010
+FAST f=21 a=10       4.760736       10.750255        8          1106
+FAST f=21 a=10       0.043483       10.750255        8          1106
+FAST f=22 a=1       6.743064       10.640537        8          1154
+FAST f=22 a=1       0.086967       10.640537        8          1154
+FAST f=22 a=2       6.121739       10.626638        6          1970
+FAST f=22 a=2       0.066337       10.626638        6          1970
+FAST f=22 a=3       5.248851       10.640688        8          1154
+FAST f=22 a=3       0.054935       10.640688        8          1154
+FAST f=22 a=4       5.436579       10.588333        8          1298
+FAST f=22 a=4       0.064113       10.588333        8          1298
+FAST f=22 a=5       5.812815       10.652653        6          1010
+FAST f=22 a=5       0.058189       10.652653        6          1010
+FAST f=22 a=6       5.745472       10.666437        6          1874
+FAST f=22 a=6       0.057188       10.666437        6          1874
+FAST f=22 a=7       5.716393       10.806911        6          1106
+FAST f=22 a=7       0.056       10.806911        6          1106
+FAST f=22 a=8       5.698799       10.530784        8          1298
+FAST f=22 a=8       0.0583       10.530784        8          1298
+FAST f=22 a=9       5.710533       10.777391        6          1010
+FAST f=22 a=9       0.054945       10.777391        6          1010
+FAST f=22 a=10       5.685395       10.745023        8          1106
+FAST f=22 a=10       0.056526       10.745023        8          1106
+FAST f=23 a=1       7.836923       10.638828        8          1154
+FAST f=23 a=1       0.099522       10.638828        8          1154
+FAST f=23 a=2       6.627834       10.631061        6          1970
+FAST f=23 a=2       0.066769       10.631061        6          1970
+FAST f=23 a=3       5.602533       10.647288        8          1154
+FAST f=23 a=3       0.064513       10.647288        8          1154
+FAST f=23 a=4       6.005580       10.568747        8          1298
+FAST f=23 a=4       0.062022       10.568747        8          1298
+FAST f=23 a=5       5.481816       10.676921        6          1010
+FAST f=23 a=5       0.058959       10.676921        6          1010
+FAST f=23 a=6       5.460444       10.666194        6          1874
+FAST f=23 a=6       0.057687       10.666194        6          1874
+FAST f=23 a=7       5.659822       10.800377        6          1106
+FAST f=23 a=7       0.06783       10.800377        6          1106
+FAST f=23 a=8       6.826940       10.522167        8          1298
+FAST f=23 a=8       0.070533       10.522167        8          1298
+FAST f=23 a=9       6.804757       10.577799        8          1682
+FAST f=23 a=9       0.069949       10.577799        8          1682
+FAST f=23 a=10       6.774933       10.742093        8          1106
+FAST f=23 a=10       0.068395       10.742093        8          1106
+FAST f=24 a=1       8.444110       10.632783        8          1154
+FAST f=24 a=1       0.094357       10.632783        8          1154
+FAST f=24 a=2       7.289578       10.631061        6          1970
+FAST f=24 a=2       0.098515       10.631061        6          1970
+FAST f=24 a=3       8.619780       10.646289        8          1154
+FAST f=24 a=3       0.098041       10.646289        8          1154
+FAST f=24 a=4       8.508455       10.555199        8          1298
+FAST f=24 a=4       0.093885       10.555199        8          1298
+FAST f=24 a=5       8.471145       10.674363        6          1010
+FAST f=24 a=5       0.088676       10.674363        6          1010
+FAST f=24 a=6       8.426727       10.667228        6          1874
+FAST f=24 a=6       0.087247       10.667228        6          1874
+FAST f=24 a=7       8.356826       10.803027        6          1106
+FAST f=24 a=7       0.085835       10.803027        6          1106
+FAST f=24 a=8       6.756811       10.522049        8          1298
+FAST f=24 a=8       0.07107       10.522049        8          1298
+FAST f=24 a=9       6.548169       10.571882        8          1682
+FAST f=24 a=9       0.0713       10.571882        8          1682
+FAST f=24 a=10       8.238079       10.736453        8          1106
+FAST f=24 a=10       0.07004       10.736453        8          1106
+
 
 hg-commands:
-NODICT       0.000026       2.425291        
-RANDOM       0.046270       3.490331        
-LEGACY       0.847904       3.911682        
-COVER       71.691804       4.132653        8          386
-COVER       3.187085       4.132653        8          386
-FAST15       11.593687       3.920720        6          1106
-FAST15       0.082431       3.920720        6          1106
-FAST16       11.775958       4.033306        8          674
-FAST16       0.092587       4.033306        8          674
-FAST17       11.965064       4.064132        8          1490
-FAST17       0.106382       4.064132        8          1490
-FAST18       11.438197       4.086714        8          290
-FAST18       0.097293       4.086714        8          290
-FAST19       12.292512       4.097947        8          578
-FAST19       0.104406       4.097947        8          578
-FAST20       13.857857       4.102851        8          434
-FAST20       0.139467       4.102851        8          434
-FAST21       14.599613       4.105350        8          530
-FAST21       0.189416       4.105350        8          530
-FAST22       15.966109       4.104100        8          530
-FAST22       0.183817       4.104100        8          530
-FAST23       18.033645       4.098110        8          914
-FAST23       0.246641       4.098110        8          914
-FAST24       22.992891       4.117367        8          722
-FAST24       0.285994       4.117367        8          722
+NODICT       0.000005       2.425276        
+RANDOM       0.046332       3.490331        
+LEGACY       0.720351       3.911682        
+COVER       45.507731       4.132653        8          386
+COVER       1.868810       4.132653        8          386
+FAST f=15 a=1       4.561427       3.866894        8          1202
+FAST f=15 a=1       0.048946       3.866894        8          1202
+FAST f=15 a=2       3.574462       3.892119        8          1538
+FAST f=15 a=2       0.033677       3.892119        8          1538
+FAST f=15 a=3       3.230227       3.888791        6          1346
+FAST f=15 a=3       0.034312       3.888791        6          1346
+FAST f=15 a=4       3.042388       3.899739        8          1010
+FAST f=15 a=4       0.024307       3.899739        8          1010
+FAST f=15 a=5       2.800148       3.896220        8          818
+FAST f=15 a=5       0.022331       3.896220        8          818
+FAST f=15 a=6       2.706518       3.882039        8          578
+FAST f=15 a=6       0.020955       3.882039        8          578
+FAST f=15 a=7       2.701820       3.885430        6          866
+FAST f=15 a=7       0.026074       3.885430        6          866
+FAST f=15 a=8       2.604445       3.906932        8          1826
+FAST f=15 a=8       0.021789       3.906932        8          1826
+FAST f=15 a=9       2.598568       3.870324        6          1682
+FAST f=15 a=9       0.026004       3.870324        6          1682
+FAST f=15 a=10       2.575920       3.920783        8          1442
+FAST f=15 a=10       0.020228       3.920783        8          1442
+FAST f=16 a=1       4.630623       4.001430        8          770
+FAST f=16 a=1       0.047497       4.001430        8          770
+FAST f=16 a=2       3.674721       3.974431        8          1874
+FAST f=16 a=2       0.035761       3.974431        8          1874
+FAST f=16 a=3       3.338384       3.978703        8          1010
+FAST f=16 a=3       0.029436       3.978703        8          1010
+FAST f=16 a=4       3.004412       3.983035        8          1010
+FAST f=16 a=4       0.025744       3.983035        8          1010
+FAST f=16 a=5       2.881892       3.987710        8          770
+FAST f=16 a=5       0.023211       3.987710        8          770
+FAST f=16 a=6       2.807410       3.952717        8          1298
+FAST f=16 a=6       0.023199       3.952717        8          1298
+FAST f=16 a=7       2.819623       3.994627        8          770
+FAST f=16 a=7       0.021806       3.994627        8          770
+FAST f=16 a=8       2.740092       3.954032        8          1826
+FAST f=16 a=8       0.0226       3.954032        8          1826
+FAST f=16 a=9       2.682564       3.969879        6          1442
+FAST f=16 a=9       0.026324       3.969879        6          1442
+FAST f=16 a=10       2.657959       3.969755        8          674
+FAST f=16 a=10       0.020413       3.969755        8          674
+FAST f=17 a=1       4.729228       4.046000        8          530
+FAST f=17 a=1       0.049703       4.046000        8          530
+FAST f=17 a=2       3.764510       3.991519        8          1970
+FAST f=17 a=2       0.038195       3.991519        8          1970
+FAST f=17 a=3       3.416992       4.006296        6          914
+FAST f=17 a=3       0.036244       4.006296        6          914
+FAST f=17 a=4       3.145626       3.979182        8          1970
+FAST f=17 a=4       0.028676       3.979182        8          1970
+FAST f=17 a=5       2.995070       4.050070        8          770
+FAST f=17 a=5       0.025707       4.050070        8          770
+FAST f=17 a=6       2.911833       4.040024        8          770
+FAST f=17 a=6       0.02453       4.040024        8          770
+FAST f=17 a=7       2.894796       4.015884        8          818
+FAST f=17 a=7       0.023956       4.015884        8          818
+FAST f=17 a=8       2.789962       4.039303        8          530
+FAST f=17 a=8       0.023219       4.039303        8          530
+FAST f=17 a=9       2.787625       3.996762        8          1634
+FAST f=17 a=9       0.023651       3.996762        8          1634
+FAST f=17 a=10       2.754796       4.005059        8          1058
+FAST f=17 a=10       0.022537       4.005059        8          1058
+FAST f=18 a=1       4.779117       4.038214        8          242
+FAST f=18 a=1       0.048814       4.038214        8          242
+FAST f=18 a=2       3.829753       4.045768        8          722
+FAST f=18 a=2       0.036541       4.045768        8          722
+FAST f=18 a=3       3.495053       4.021497        8          770
+FAST f=18 a=3       0.032648       4.021497        8          770
+FAST f=18 a=4       3.221395       4.039623        8          770
+FAST f=18 a=4       0.027818       4.039623        8          770
+FAST f=18 a=5       3.059369       4.050414        8          530
+FAST f=18 a=5       0.026296       4.050414        8          530
+FAST f=18 a=6       3.019292       4.010714        6          962
+FAST f=18 a=6       0.031104       4.010714        6          962
+FAST f=18 a=7       2.949322       4.031439        6          770
+FAST f=18 a=7       0.030745       4.031439        6          770
+FAST f=18 a=8       2.876425       4.032088        6          386
+FAST f=18 a=8       0.027407       4.032088        6          386
+FAST f=18 a=9       2.850958       4.053372        8          674
+FAST f=18 a=9       0.023799       4.053372        8          674
+FAST f=18 a=10       2.884352       4.020148        8          1730
+FAST f=18 a=10       0.024401       4.020148        8          1730
+FAST f=19 a=1       4.815669       4.061203        8          674
+FAST f=19 a=1       0.051425       4.061203        8          674
+FAST f=19 a=2       3.951356       4.013822        8          1442
+FAST f=19 a=2       0.039968       4.013822        8          1442
+FAST f=19 a=3       3.554682       4.050425        8          722
+FAST f=19 a=3       0.032725       4.050425        8          722
+FAST f=19 a=4       3.242585       4.054677        8          722
+FAST f=19 a=4       0.028194       4.054677        8          722
+FAST f=19 a=5       3.105909       4.064524        8          818
+FAST f=19 a=5       0.02675       4.064524        8          818
+FAST f=19 a=6       3.059901       4.036857        8          1250
+FAST f=19 a=6       0.026396       4.036857        8          1250
+FAST f=19 a=7       3.016151       4.068234        6          770
+FAST f=19 a=7       0.031501       4.068234        6          770
+FAST f=19 a=8       2.962902       4.077509        8          530
+FAST f=19 a=8       0.023333       4.077509        8          530
+FAST f=19 a=9       2.899607       4.067328        8          530
+FAST f=19 a=9       0.024553       4.067328        8          530
+FAST f=19 a=10       2.950978       4.059901        8          434
+FAST f=19 a=10       0.023852       4.059901        8          434
+FAST f=20 a=1       5.259834       4.027579        8          1634
+FAST f=20 a=1       0.061123       4.027579        8          1634
+FAST f=20 a=2       4.382150       4.025093        8          1634
+FAST f=20 a=2       0.048009       4.025093        8          1634
+FAST f=20 a=3       4.104323       4.060842        8          530
+FAST f=20 a=3       0.040965       4.060842        8          530
+FAST f=20 a=4       3.853340       4.023504        6          914
+FAST f=20 a=4       0.041072       4.023504        6          914
+FAST f=20 a=5       3.728841       4.018089        6          1634
+FAST f=20 a=5       0.037469       4.018089        6          1634
+FAST f=20 a=6       3.683045       4.069138        8          578
+FAST f=20 a=6       0.028011       4.069138        8          578
+FAST f=20 a=7       3.726973       4.063160        8          722
+FAST f=20 a=7       0.028437       4.063160        8          722
+FAST f=20 a=8       3.555073       4.057690        8          386
+FAST f=20 a=8       0.027588       4.057690        8          386
+FAST f=20 a=9       3.551095       4.067253        8          482
+FAST f=20 a=9       0.025976       4.067253        8          482
+FAST f=20 a=10       3.490127       4.068518        8          530
+FAST f=20 a=10       0.025971       4.068518        8          530
+FAST f=21 a=1       7.343816       4.064945        8          770
+FAST f=21 a=1       0.085035       4.064945        8          770
+FAST f=21 a=2       5.930894       4.048206        8          386
+FAST f=21 a=2       0.067349       4.048206        8          386
+FAST f=21 a=3       6.770775       4.063417        8          578
+FAST f=21 a=3       0.077104       4.063417        8          578
+FAST f=21 a=4       6.889409       4.066761        8          626
+FAST f=21 a=4       0.0717       4.066761        8          626
+FAST f=21 a=5       6.714896       4.051813        8          914
+FAST f=21 a=5       0.071026       4.051813        8          914
+FAST f=21 a=6       6.539890       4.047263        8          1922
+FAST f=21 a=6       0.07127       4.047263        8          1922
+FAST f=21 a=7       6.511052       4.068373        8          482
+FAST f=21 a=7       0.065467       4.068373        8          482
+FAST f=21 a=8       6.458788       4.071597        8          482
+FAST f=21 a=8       0.063817       4.071597        8          482
+FAST f=21 a=9       6.377591       4.052905        8          434
+FAST f=21 a=9       0.063112       4.052905        8          434
+FAST f=21 a=10       6.360752       4.047773        8          530
+FAST f=21 a=10       0.063606       4.047773        8          530
+FAST f=22 a=1       10.523471       4.040812        8          962
+FAST f=22 a=1       0.14214       4.040812        8          962
+FAST f=22 a=2       9.454758       4.059396        8          914
+FAST f=22 a=2       0.118343       4.059396        8          914
+FAST f=22 a=3       9.043197       4.043019        8          1922
+FAST f=22 a=3       0.109798       4.043019        8          1922
+FAST f=22 a=4       8.716261       4.044819        8          770
+FAST f=22 a=4       0.099687       4.044819        8          770
+FAST f=22 a=5       8.529472       4.070576        8          530
+FAST f=22 a=5       0.093127       4.070576        8          530
+FAST f=22 a=6       8.424241       4.070565        8          722
+FAST f=22 a=6       0.093703       4.070565        8          722
+FAST f=22 a=7       8.403391       4.070591        8          578
+FAST f=22 a=7       0.089763       4.070591        8          578
+FAST f=22 a=8       8.285221       4.089171        8          530
+FAST f=22 a=8       0.087716       4.089171        8          530
+FAST f=22 a=9       8.282506       4.047470        8          722
+FAST f=22 a=9       0.089773       4.047470        8          722
+FAST f=22 a=10       8.241809       4.064151        8          818
+FAST f=22 a=10       0.090413       4.064151        8          818
+FAST f=23 a=1       12.389208       4.051635        6          530
+FAST f=23 a=1       0.147796       4.051635        6          530
+FAST f=23 a=2       11.300910       4.042835        6          914
+FAST f=23 a=2       0.133178       4.042835        6          914
+FAST f=23 a=3       10.879455       4.047415        8          626
+FAST f=23 a=3       0.129571       4.047415        8          626
+FAST f=23 a=4       10.522718       4.038269        6          914
+FAST f=23 a=4       0.118121       4.038269        6          914
+FAST f=23 a=5       10.348043       4.066884        8          434
+FAST f=23 a=5       0.112098       4.066884        8          434
+FAST f=23 a=6       10.238630       4.048635        8          1010
+FAST f=23 a=6       0.120281       4.048635        8          1010
+FAST f=23 a=7       10.213255       4.061809        8          530
+FAST f=23 a=7       0.1121       4.061809        8          530
+FAST f=23 a=8       10.107879       4.074104        8          818
+FAST f=23 a=8       0.116544       4.074104        8          818
+FAST f=23 a=9       10.063424       4.064811        8          674
+FAST f=23 a=9       0.109045       4.064811        8          674
+FAST f=23 a=10       10.035801       4.054918        8          530
+FAST f=23 a=10       0.108735       4.054918        8          530
+FAST f=24 a=1       14.963878       4.073490        8          722
+FAST f=24 a=1       0.206344       4.073490        8          722
+FAST f=24 a=2       13.833472       4.036100        8          962
+FAST f=24 a=2       0.17486       4.036100        8          962
+FAST f=24 a=3       13.404631       4.026281        6          1106
+FAST f=24 a=3       0.153961       4.026281        6          1106
+FAST f=24 a=4       13.041164       4.065448        8          674
+FAST f=24 a=4       0.155509       4.065448        8          674
+FAST f=24 a=5       12.879412       4.054636        8          674
+FAST f=24 a=5       0.148282       4.054636        8          674
+FAST f=24 a=6       12.773736       4.081376        8          530
+FAST f=24 a=6       0.142563       4.081376        8          530
+FAST f=24 a=7       12.711310       4.059834        8          770
+FAST f=24 a=7       0.149321       4.059834        8          770
+FAST f=24 a=8       12.635459       4.052050        8          1298
+FAST f=24 a=8       0.15095       4.052050        8          1298
+FAST f=24 a=9       12.558104       4.076516        8          722
+FAST f=24 a=9       0.144361       4.076516        8          722
+FAST f=24 a=10       10.661348       4.062137        8          818
+FAST f=24 a=10       0.108232       4.062137        8          818
+
 
 hg-changelog:
-NODICT       0.000007       1.377613        
-RANDOM       0.297345       2.097487        
-LEGACY       2.633992       2.058907        
-COVER       219.179786       2.189685        8          98
-COVER       6.620852       2.189685        8          98
-FAST15       47.635082       2.130794        6          386
-FAST15       0.321297       2.130794        6          386
-FAST16       43.837676       2.144845        8          194
-FAST16       0.312640       2.144845        8          194
-FAST17       49.349017       2.156099        8          242
-FAST17       0.348459       2.156099        8          242
-FAST18       51.153784       2.172439        6          98
-FAST18       0.353106       2.172439        6          98
-FAST19       52.627045       2.180321        6          98
-FAST19       0.390612       2.180321        6          98
-FAST20       63.748782       2.187431        6          98
-FAST20       0.489544       2.187431        6          98
-FAST21       68.709198       2.184185        6          146
-FAST21       0.530852       2.184185        6          146
-FAST22       68.491639       2.182830        6          98
-FAST22       0.645699       2.182830        6          98
-FAST23       72.558688       2.186399        8          98
-FAST23       0.593539       2.186399        8          98
-FAST24       76.137195       2.185608        6          98
-FAST24       0.680132       2.185608        6          98
+NODICT       0.000017       1.377590        
+RANDOM       0.186171       2.097487        
+LEGACY       1.670867       2.058907        
+COVER       173.561948       2.189685        8          98
+COVER       4.811180       2.189685        8          98
+FAST f=15 a=1       18.685906       2.129682        8          434
+FAST f=15 a=1       0.173376       2.129682        8          434
+FAST f=15 a=2       12.928259       2.131890        8          482
+FAST f=15 a=2       0.102582       2.131890        8          482
+FAST f=15 a=3       11.132343       2.128027        8          386
+FAST f=15 a=3       0.077122       2.128027        8          386
+FAST f=15 a=4       10.120683       2.125797        8          434
+FAST f=15 a=4       0.065175       2.125797        8          434
+FAST f=15 a=5       9.479092       2.127697        8          386
+FAST f=15 a=5       0.057905       2.127697        8          386
+FAST f=15 a=6       9.159523       2.127132        8          1682
+FAST f=15 a=6       0.058604       2.127132        8          1682
+FAST f=15 a=7       8.724003       2.129914        8          434
+FAST f=15 a=7       0.0493       2.129914        8          434
+FAST f=15 a=8       8.595001       2.127137        8          338
+FAST f=15 a=8       0.0474       2.127137        8          338
+FAST f=15 a=9       8.356405       2.125512        8          482
+FAST f=15 a=9       0.046126       2.125512        8          482
+FAST f=15 a=10       8.207111       2.126066        8          338
+FAST f=15 a=10       0.043292       2.126066        8          338
+FAST f=16 a=1       18.464436       2.144040        8          242
+FAST f=16 a=1       0.172156       2.144040        8          242
+FAST f=16 a=2       12.844825       2.148171        8          194
+FAST f=16 a=2       0.099619       2.148171        8          194
+FAST f=16 a=3       11.082568       2.140837        8          290
+FAST f=16 a=3       0.079165       2.140837        8          290
+FAST f=16 a=4       10.066749       2.144405        8          386
+FAST f=16 a=4       0.068411       2.144405        8          386
+FAST f=16 a=5       9.501121       2.140720        8          386
+FAST f=16 a=5       0.061316       2.140720        8          386
+FAST f=16 a=6       9.179332       2.139478        8          386
+FAST f=16 a=6       0.056322       2.139478        8          386
+FAST f=16 a=7       8.849438       2.142412        8          194
+FAST f=16 a=7       0.050493       2.142412        8          194
+FAST f=16 a=8       8.810919       2.143454        8          434
+FAST f=16 a=8       0.051304       2.143454        8          434
+FAST f=16 a=9       8.553900       2.140339        8          194
+FAST f=16 a=9       0.047285       2.140339        8          194
+FAST f=16 a=10       8.398027       2.143130        8          386
+FAST f=16 a=10       0.046386       2.143130        8          386
+FAST f=17 a=1       18.644657       2.157192        8          98
+FAST f=17 a=1       0.173884       2.157192        8          98
+FAST f=17 a=2       13.071242       2.159830        8          146
+FAST f=17 a=2       0.10388       2.159830        8          146
+FAST f=17 a=3       11.332366       2.153654        6          194
+FAST f=17 a=3       0.08983       2.153654        6          194
+FAST f=17 a=4       10.362413       2.156813        8          242
+FAST f=17 a=4       0.070389       2.156813        8          242
+FAST f=17 a=5       9.808159       2.155098        6          338
+FAST f=17 a=5       0.072661       2.155098        6          338
+FAST f=17 a=6       9.451165       2.153845        6          146
+FAST f=17 a=6       0.064959       2.153845        6          146
+FAST f=17 a=7       9.163097       2.155424        6          242
+FAST f=17 a=7       0.064323       2.155424        6          242
+FAST f=17 a=8       9.047276       2.156640        8          242
+FAST f=17 a=8       0.053382       2.156640        8          242
+FAST f=17 a=9       8.807671       2.152396        8          146
+FAST f=17 a=9       0.049617       2.152396        8          146
+FAST f=17 a=10       8.649827       2.152370        8          146
+FAST f=17 a=10       0.047849       2.152370        8          146
+FAST f=18 a=1       18.809502       2.168116        8          98
+FAST f=18 a=1       0.175226       2.168116        8          98
+FAST f=18 a=2       13.756502       2.170870        6          242
+FAST f=18 a=2       0.119507       2.170870        6          242
+FAST f=18 a=3       12.059748       2.163094        6          98
+FAST f=18 a=3       0.093912       2.163094        6          98
+FAST f=18 a=4       11.410294       2.172372        8          98
+FAST f=18 a=4       0.073048       2.172372        8          98
+FAST f=18 a=5       10.560297       2.166388        8          98
+FAST f=18 a=5       0.065136       2.166388        8          98
+FAST f=18 a=6       10.071390       2.162672        8          98
+FAST f=18 a=6       0.059402       2.162672        8          98
+FAST f=18 a=7       10.084214       2.166624        6          194
+FAST f=18 a=7       0.073276       2.166624        6          194
+FAST f=18 a=8       9.953226       2.167454        8          98
+FAST f=18 a=8       0.053659       2.167454        8          98
+FAST f=18 a=9       8.982461       2.161593        6          146
+FAST f=18 a=9       0.05955       2.161593        6          146
+FAST f=18 a=10       8.986092       2.164373        6          242
+FAST f=18 a=10       0.059135       2.164373        6          242
+FAST f=19 a=1       18.908277       2.176021        8          98
+FAST f=19 a=1       0.177316       2.176021        8          98
+FAST f=19 a=2       13.471313       2.176103        8          98
+FAST f=19 a=2       0.106344       2.176103        8          98
+FAST f=19 a=3       11.571406       2.172812        8          98
+FAST f=19 a=3       0.083293       2.172812        8          98
+FAST f=19 a=4       10.632775       2.177770        6          146
+FAST f=19 a=4       0.079864       2.177770        6          146
+FAST f=19 a=5       10.030190       2.175574        6          146
+FAST f=19 a=5       0.07223       2.175574        6          146
+FAST f=19 a=6       9.717818       2.169997        8          98
+FAST f=19 a=6       0.060049       2.169997        8          98
+FAST f=19 a=7       9.397531       2.172770        8          146
+FAST f=19 a=7       0.057188       2.172770        8          146
+FAST f=19 a=8       9.281061       2.175822        8          98
+FAST f=19 a=8       0.053711       2.175822        8          98
+FAST f=19 a=9       9.165242       2.169849        6          146
+FAST f=19 a=9       0.059898       2.169849        6          146
+FAST f=19 a=10       9.048763       2.173394        8          98
+FAST f=19 a=10       0.049757       2.173394        8          98
+FAST f=20 a=1       21.166917       2.183923        6          98
+FAST f=20 a=1       0.205425       2.183923        6          98
+FAST f=20 a=2       15.642753       2.182349        6          98
+FAST f=20 a=2       0.135957       2.182349        6          98
+FAST f=20 a=3       14.053730       2.173544        6          98
+FAST f=20 a=3       0.11266       2.173544        6          98
+FAST f=20 a=4       15.270019       2.183656        8          98
+FAST f=20 a=4       0.107892       2.183656        8          98
+FAST f=20 a=5       15.497927       2.174661        6          98
+FAST f=20 a=5       0.100305       2.174661        6          98
+FAST f=20 a=6       13.973505       2.172391        8          98
+FAST f=20 a=6       0.087565       2.172391        8          98
+FAST f=20 a=7       14.083296       2.172443        8          98
+FAST f=20 a=7       0.078062       2.172443        8          98
+FAST f=20 a=8       12.560048       2.175581        8          98
+FAST f=20 a=8       0.070282       2.175581        8          98
+FAST f=20 a=9       13.078645       2.173975        6          146
+FAST f=20 a=9       0.081041       2.173975        6          146
+FAST f=20 a=10       12.823328       2.177778        8          98
+FAST f=20 a=10       0.074522       2.177778        8          98
+FAST f=21 a=1       29.825370       2.183057        6          98
+FAST f=21 a=1       0.334453       2.183057        6          98
+FAST f=21 a=2       29.476474       2.182752        8          98
+FAST f=21 a=2       0.286602       2.182752        8          98
+FAST f=21 a=3       25.937186       2.175867        8          98
+FAST f=21 a=3       0.17626       2.175867        8          98
+FAST f=21 a=4       20.413865       2.179780        8          98
+FAST f=21 a=4       0.206085       2.179780        8          98
+FAST f=21 a=5       20.541889       2.178328        6          146
+FAST f=21 a=5       0.199157       2.178328        6          146
+FAST f=21 a=6       21.090670       2.174443        6          146
+FAST f=21 a=6       0.190645       2.174443        6          146
+FAST f=21 a=7       20.221569       2.177384        6          146
+FAST f=21 a=7       0.184278       2.177384        6          146
+FAST f=21 a=8       20.322357       2.179456        6          98
+FAST f=21 a=8       0.178458       2.179456        6          98
+FAST f=21 a=9       20.683912       2.174396        6          146
+FAST f=21 a=9       0.190829       2.174396        6          146
+FAST f=21 a=10       20.840865       2.174905        8          98
+FAST f=21 a=10       0.172515       2.174905        8          98
+FAST f=22 a=1       36.822827       2.181612        6          98
+FAST f=22 a=1       0.437389       2.181612        6          98
+FAST f=22 a=2       30.616902       2.183142        8          98
+FAST f=22 a=2       0.324284       2.183142        8          98
+FAST f=22 a=3       28.472482       2.178130        8          98
+FAST f=22 a=3       0.236538       2.178130        8          98
+FAST f=22 a=4       25.847028       2.181878        8          98
+FAST f=22 a=4       0.263744       2.181878        8          98
+FAST f=22 a=5       27.095881       2.180775        8          98
+FAST f=22 a=5       0.24988       2.180775        8          98
+FAST f=22 a=6       25.939172       2.170916        8          98
+FAST f=22 a=6       0.240033       2.170916        8          98
+FAST f=22 a=7       27.064194       2.177849        8          98
+FAST f=22 a=7       0.242383       2.177849        8          98
+FAST f=22 a=8       25.140221       2.178216        8          98
+FAST f=22 a=8       0.237601       2.178216        8          98
+FAST f=22 a=9       25.505283       2.177455        6          146
+FAST f=22 a=9       0.223217       2.177455        6          146
+FAST f=22 a=10       24.529362       2.176705        6          98
+FAST f=22 a=10       0.222876       2.176705        6          98
+FAST f=23 a=1       39.127310       2.183006        6          98
+FAST f=23 a=1       0.417338       2.183006        6          98
+FAST f=23 a=2       32.468161       2.183524        6          98
+FAST f=23 a=2       0.351645       2.183524        6          98
+FAST f=23 a=3       31.577620       2.172604        6          98
+FAST f=23 a=3       0.319659       2.172604        6          98
+FAST f=23 a=4       30.129247       2.183932        6          98
+FAST f=23 a=4       0.307239       2.183932        6          98
+FAST f=23 a=5       29.103376       2.183529        6          146
+FAST f=23 a=5       0.285533       2.183529        6          146
+FAST f=23 a=6       29.776045       2.174367        8          98
+FAST f=23 a=6       0.276846       2.174367        8          98
+FAST f=23 a=7       28.940407       2.178022        6          146
+FAST f=23 a=7       0.274082       2.178022        6          146
+FAST f=23 a=8       29.256009       2.179462        6          98
+FAST f=23 a=8       0.26949       2.179462        6          98
+FAST f=23 a=9       29.347312       2.170407        8          98
+FAST f=23 a=9       0.265034       2.170407        8          98
+FAST f=23 a=10       29.140081       2.171762        8          98
+FAST f=23 a=10       0.259183       2.171762        8          98
+FAST f=24 a=1       44.871179       2.182115        6          98
+FAST f=24 a=1       0.509433       2.182115        6          98
+FAST f=24 a=2       38.694867       2.180549        8          98
+FAST f=24 a=2       0.406695       2.180549        8          98
+FAST f=24 a=3       38.363769       2.172821        8          98
+FAST f=24 a=3       0.359581       2.172821        8          98
+FAST f=24 a=4       36.580797       2.184142        8          98
+FAST f=24 a=4       0.340614       2.184142        8          98
+FAST f=24 a=5       33.125701       2.183301        8          98
+FAST f=24 a=5       0.324874       2.183301        8          98
+FAST f=24 a=6       34.776068       2.173019        6          146
+FAST f=24 a=6       0.340397       2.173019        6          146
+FAST f=24 a=7       34.417625       2.176561        6          146
+FAST f=24 a=7       0.308223       2.176561        6          146
+FAST f=24 a=8       35.470291       2.182161        6          98
+FAST f=24 a=8       0.307724       2.182161        6          98
+FAST f=24 a=9       34.927252       2.172682        6          146
+FAST f=24 a=9       0.300598       2.172682        6          146
+FAST f=24 a=10       33.238355       2.173395        6          98
+FAST f=24 a=10       0.249916       2.173395        6          98
+
 
 hg-manifest:
-NODICT       0.000026       1.866385        
-RANDOM       0.784554       2.309436        
-LEGACY       10.193714       2.506977        
-COVER       988.206583       2.582528        8          434
-COVER       39.726199       2.582528        8          434
-FAST15       168.388819       2.392920        6          1826
-FAST15       1.272178       2.392920        6          1826
-FAST16       161.822607       2.480762        6          1922
-FAST16       1.164908       2.480762        6          1922
-FAST17       157.688544       2.548285        6          1682
-FAST17       1.222439       2.548285        6          1682
-FAST18       154.529585       2.567634        6          386
-FAST18       1.217596       2.567634        6          386
-FAST19       160.244979       2.581653        8          338
-FAST19       1.282450       2.581653        8          338
-FAST20       191.503297       2.586881        8          194
-FAST20       2.009748       2.586881        8          194
-FAST21       226.389709       2.590051        6          242
-FAST21       2.494543       2.590051        6          242
-FAST22       217.859055       2.591376        6          194
-FAST22       2.295693       2.591376        6          194
-FAST23       236.819791       2.591131        8          434
-FAST23       2.744711       2.591131        8          434
-FAST24       269.187800       2.591548        6          290
-FAST24       2.923671       2.591548        6          290
+NODICT       0.000004       1.866377        
+RANDOM       0.696346       2.309436        
+LEGACY       7.064527       2.506977        
+COVER       876.312865       2.582528        8          434
+COVER       35.684533       2.582528        8          434
+FAST f=15 a=1       76.618201       2.404013        8          1202
+FAST f=15 a=1       0.700722       2.404013        8          1202
+FAST f=15 a=2       49.213058       2.409248        6          1826
+FAST f=15 a=2       0.473393       2.409248        6          1826
+FAST f=15 a=3       41.753197       2.409677        8          1490
+FAST f=15 a=3       0.336848       2.409677        8          1490
+FAST f=15 a=4       38.648295       2.407996        8          1538
+FAST f=15 a=4       0.283952       2.407996        8          1538
+FAST f=15 a=5       36.144936       2.402895        8          1874
+FAST f=15 a=5       0.270128       2.402895        8          1874
+FAST f=15 a=6       35.484675       2.394873        8          1586
+FAST f=15 a=6       0.251637       2.394873        8          1586
+FAST f=15 a=7       34.280599       2.397311        8          1778
+FAST f=15 a=7       0.23984       2.397311        8          1778
+FAST f=15 a=8       32.122572       2.396089        6          1490
+FAST f=15 a=8       0.251508       2.396089        6          1490
+FAST f=15 a=9       29.909842       2.390092        6          1970
+FAST f=15 a=9       0.251233       2.390092        6          1970
+FAST f=15 a=10       30.102938       2.400086        6          1682
+FAST f=15 a=10       0.23688       2.400086        6          1682
+FAST f=16 a=1       67.750401       2.475460        6          1346
+FAST f=16 a=1       0.796035       2.475460        6          1346
+FAST f=16 a=2       52.812027       2.480860        6          1730
+FAST f=16 a=2       0.480384       2.480860        6          1730
+FAST f=16 a=3       44.179259       2.469304        8          1970
+FAST f=16 a=3       0.332657       2.469304        8          1970
+FAST f=16 a=4       37.612728       2.478208        6          1970
+FAST f=16 a=4       0.32498       2.478208        6          1970
+FAST f=16 a=5       35.056222       2.475568        6          1298
+FAST f=16 a=5       0.302824       2.475568        6          1298
+FAST f=16 a=6       34.713012       2.486079        8          1730
+FAST f=16 a=6       0.24755       2.486079        8          1730
+FAST f=16 a=7       33.713687       2.477180        6          1682
+FAST f=16 a=7       0.280358       2.477180        6          1682
+FAST f=16 a=8       31.571412       2.475418        8          1538
+FAST f=16 a=8       0.241241       2.475418        8          1538
+FAST f=16 a=9       31.608069       2.478263        8          1922
+FAST f=16 a=9       0.241764       2.478263        8          1922
+FAST f=16 a=10       31.358002       2.472263        8          1442
+FAST f=16 a=10       0.221661       2.472263        8          1442
+FAST f=17 a=1       66.185775       2.536085        6          1346
+FAST f=17 a=1       0.713549       2.536085        6          1346
+FAST f=17 a=2       50.365000       2.546105        8          1298
+FAST f=17 a=2       0.467846       2.546105        8          1298
+FAST f=17 a=3       42.712843       2.536250        8          1298
+FAST f=17 a=3       0.34047       2.536250        8          1298
+FAST f=17 a=4       39.514227       2.535555        8          1442
+FAST f=17 a=4       0.302989       2.535555        8          1442
+FAST f=17 a=5       35.189292       2.524925        8          1202
+FAST f=17 a=5       0.273451       2.524925        8          1202
+FAST f=17 a=6       35.791683       2.523466        8          1202
+FAST f=17 a=6       0.268261       2.523466        8          1202
+FAST f=17 a=7       37.416136       2.526625        6          1010
+FAST f=17 a=7       0.277558       2.526625        6          1010
+FAST f=17 a=8       37.084707       2.533274        6          1250
+FAST f=17 a=8       0.285104       2.533274        6          1250
+FAST f=17 a=9       34.183814       2.532765        8          1298
+FAST f=17 a=9       0.235133       2.532765        8          1298
+FAST f=17 a=10       31.149235       2.528722        8          1346
+FAST f=17 a=10       0.232679       2.528722        8          1346
+FAST f=18 a=1       72.942176       2.559857        6          386
+FAST f=18 a=1       0.718618       2.559857        6          386
+FAST f=18 a=2       51.690440       2.559572        8          290
+FAST f=18 a=2       0.403978       2.559572        8          290
+FAST f=18 a=3       45.344908       2.561040        8          962
+FAST f=18 a=3       0.357205       2.561040        8          962
+FAST f=18 a=4       39.804522       2.558446        8          1010
+FAST f=18 a=4       0.310526       2.558446        8          1010
+FAST f=18 a=5       38.134888       2.561811        8          626
+FAST f=18 a=5       0.273743       2.561811        8          626
+FAST f=18 a=6       35.091890       2.555518        8          722
+FAST f=18 a=6       0.260135       2.555518        8          722
+FAST f=18 a=7       34.639523       2.562938        8          290
+FAST f=18 a=7       0.234294       2.562938        8          290
+FAST f=18 a=8       36.076431       2.563567        8          1586
+FAST f=18 a=8       0.274075       2.563567        8          1586
+FAST f=18 a=9       36.376433       2.560950        8          722
+FAST f=18 a=9       0.240106       2.560950        8          722
+FAST f=18 a=10       32.624790       2.559340        8          578
+FAST f=18 a=10       0.234704       2.559340        8          578
+FAST f=19 a=1       70.513761       2.572441        8          194
+FAST f=19 a=1       0.726112       2.572441        8          194
+FAST f=19 a=2       59.263032       2.574560        8          482
+FAST f=19 a=2       0.451554       2.574560        8          482
+FAST f=19 a=3       51.509594       2.571546        6          194
+FAST f=19 a=3       0.393014       2.571546        6          194
+FAST f=19 a=4       55.393906       2.573386        8          482
+FAST f=19 a=4       0.38819       2.573386        8          482
+FAST f=19 a=5       43.201736       2.567589        8          674
+FAST f=19 a=5       0.292155       2.567589        8          674
+FAST f=19 a=6       42.911687       2.572666        6          434
+FAST f=19 a=6       0.303988       2.572666        6          434
+FAST f=19 a=7       44.687591       2.573613        6          290
+FAST f=19 a=7       0.308721       2.573613        6          290
+FAST f=19 a=8       37.372868       2.571039        6          194
+FAST f=19 a=8       0.287137       2.571039        6          194
+FAST f=19 a=9       36.074230       2.566473        6          482
+FAST f=19 a=9       0.280721       2.566473        6          482
+FAST f=19 a=10       33.731720       2.570306        8          194
+FAST f=19 a=10       0.224073       2.570306        8          194
+FAST f=20 a=1       79.670634       2.581146        6          290
+FAST f=20 a=1       0.899986       2.581146        6          290
+FAST f=20 a=2       58.827141       2.579782        8          386
+FAST f=20 a=2       0.602288       2.579782        8          386
+FAST f=20 a=3       51.289004       2.579627        8          722
+FAST f=20 a=3       0.446091       2.579627        8          722
+FAST f=20 a=4       47.711068       2.581508        8          722
+FAST f=20 a=4       0.473007       2.581508        8          722
+FAST f=20 a=5       47.402929       2.578062        6          434
+FAST f=20 a=5       0.497131       2.578062        6          434
+FAST f=20 a=6       54.797102       2.577365        8          482
+FAST f=20 a=6       0.515061       2.577365        8          482
+FAST f=20 a=7       51.370877       2.583050        8          386
+FAST f=20 a=7       0.402878       2.583050        8          386
+FAST f=20 a=8       51.437931       2.574875        6          242
+FAST f=20 a=8       0.453094       2.574875        6          242
+FAST f=20 a=9       44.105456       2.576700        6          242
+FAST f=20 a=9       0.456633       2.576700        6          242
+FAST f=20 a=10       44.447580       2.578305        8          338
+FAST f=20 a=10       0.409121       2.578305        8          338
+FAST f=21 a=1       113.031686       2.582449        6          242
+FAST f=21 a=1       1.456971       2.582449        6          242
+FAST f=21 a=2       97.700932       2.582124        8          194
+FAST f=21 a=2       1.072078       2.582124        8          194
+FAST f=21 a=3       96.563648       2.585479        8          434
+FAST f=21 a=3       0.949528       2.585479        8          434
+FAST f=21 a=4       90.597813       2.582366        6          386
+FAST f=21 a=4       0.76944       2.582366        6          386
+FAST f=21 a=5       86.815980       2.579043        8          434
+FAST f=21 a=5       0.858167       2.579043        8          434
+FAST f=21 a=6       91.235820       2.578378        8          530
+FAST f=21 a=6       0.684274       2.578378        8          530
+FAST f=21 a=7       84.392788       2.581243        8          386
+FAST f=21 a=7       0.814386       2.581243        8          386
+FAST f=21 a=8       82.052310       2.582547        8          338
+FAST f=21 a=8       0.822633       2.582547        8          338
+FAST f=21 a=9       74.696074       2.579319        8          194
+FAST f=21 a=9       0.811028       2.579319        8          194
+FAST f=21 a=10       76.211170       2.578766        8          290
+FAST f=21 a=10       0.809715       2.578766        8          290
+FAST f=22 a=1       138.976871       2.580478        8          194
+FAST f=22 a=1       1.748932       2.580478        8          194
+FAST f=22 a=2       120.164097       2.583633        8          386
+FAST f=22 a=2       1.333239       2.583633        8          386
+FAST f=22 a=3       111.986474       2.582566        6          194
+FAST f=22 a=3       1.305734       2.582566        6          194
+FAST f=22 a=4       108.548148       2.583068        6          194
+FAST f=22 a=4       1.314026       2.583068        6          194
+FAST f=22 a=5       103.173017       2.583495        6          290
+FAST f=22 a=5       1.228664       2.583495        6          290
+FAST f=22 a=6       108.421262       2.582349        8          530
+FAST f=22 a=6       1.076773       2.582349        8          530
+FAST f=22 a=7       103.284127       2.581022        8          386
+FAST f=22 a=7       1.112117       2.581022        8          386
+FAST f=22 a=8       96.330279       2.581073        8          290
+FAST f=22 a=8       1.109303       2.581073        8          290
+FAST f=22 a=9       97.651348       2.580075        6          194
+FAST f=22 a=9       0.933032       2.580075        6          194
+FAST f=22 a=10       101.660621       2.584886        8          194
+FAST f=22 a=10       0.796823       2.584886        8          194
+FAST f=23 a=1       159.322978       2.581474        6          242
+FAST f=23 a=1       2.015878       2.581474        6          242
+FAST f=23 a=2       134.331775       2.581619        8          194
+FAST f=23 a=2       1.545845       2.581619        8          194
+FAST f=23 a=3       127.724552       2.579888        6          338
+FAST f=23 a=3       1.444496       2.579888        6          338
+FAST f=23 a=4       126.077675       2.578137        6          242
+FAST f=23 a=4       1.364394       2.578137        6          242
+FAST f=23 a=5       124.914027       2.580843        8          338
+FAST f=23 a=5       1.116059       2.580843        8          338
+FAST f=23 a=6       122.874153       2.577637        6          338
+FAST f=23 a=6       1.164584       2.577637        6          338
+FAST f=23 a=7       123.099257       2.582715        6          386
+FAST f=23 a=7       1.354042       2.582715        6          386
+FAST f=23 a=8       122.026753       2.577681        8          194
+FAST f=23 a=8       1.210966       2.577681        8          194
+FAST f=23 a=9       121.164312       2.584599        6          290
+FAST f=23 a=9       1.174859       2.584599        6          290
+FAST f=23 a=10       117.462222       2.580358        8          194
+FAST f=23 a=10       1.075258       2.580358        8          194
+FAST f=24 a=1       169.539659       2.581642        6          194
+FAST f=24 a=1       1.916804       2.581642        6          194
+FAST f=24 a=2       160.539270       2.580421        6          290
+FAST f=24 a=2       1.71087       2.580421        6          290
+FAST f=24 a=3       155.455874       2.580449        6          242
+FAST f=24 a=3       1.60307       2.580449        6          242
+FAST f=24 a=4       147.630320       2.582953        6          338
+FAST f=24 a=4       1.396364       2.582953        6          338
+FAST f=24 a=5       133.767428       2.580589        6          290
+FAST f=24 a=5       1.19933       2.580589        6          290
+FAST f=24 a=6       146.437535       2.579453        8          194
+FAST f=24 a=6       1.385405       2.579453        8          194
+FAST f=24 a=7       147.227507       2.584155        8          386
+FAST f=24 a=7       1.48942       2.584155        8          386
+FAST f=24 a=8       138.005773       2.584115        8          194
+FAST f=24 a=8       1.352       2.584115        8          194
+FAST f=24 a=9       141.442625       2.582902        8          290
+FAST f=24 a=9       1.39647       2.582902        8          290
+FAST f=24 a=10       142.157446       2.582701        8          434
+FAST f=24 a=10       1.498889       2.582701        8          434
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
index d92e8d5cb..b19345692 100644
--- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
@@ -5,7 +5,6 @@
 #include <ctype.h>
 #include <time.h>
 #include "random.h"
-#include "fastCover.h"
 #include "dictBuilder.h"
 #include "zstd_internal.h" /* includes zstd.h */
 #include "io.h"
@@ -149,7 +148,7 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
   /* Allocate dst with enough space to compress the maximum sized sample */
   {
     size_t maxSampleSize = 0;
-    for (int i = 0; i < srcInfo->nbSamples; i++) {
+    for (i = 0; i < srcInfo->nbSamples; i++) {
       maxSampleSize = MAX(srcInfo->samplesSizes[i], maxSampleSize);
     }
     dstCapacity = ZSTD_compressBound(maxSampleSize);
@@ -291,6 +290,9 @@ int main(int argCount, const char* argv[])
   /* Initialize arguments to default values */
   unsigned k = 200;
   unsigned d = 8;
+  unsigned f;
+  unsigned accel;
+  unsigned i;
   const unsigned cLevel = DEFAULT_CLEVEL;
   const unsigned dictID = 0;
   const unsigned maxDictSize = g_defaultMaxDictSize;
@@ -305,7 +307,7 @@ int main(int argCount, const char* argv[])
   const char** extendedFileList = NULL;
 
   /* Parse arguments */
-  for (int i = 1; i < argCount; i++) {
+  for (i = 1; i < argCount; i++) {
     const char* argument = argv[i];
     if (longCommandWArg(&argument, "in=")) {
       filenameTable[filenameIdx] = argument;
@@ -375,6 +377,7 @@ int main(int argCount, const char* argv[])
 
   /* for cover */
   {
+    /* for cover (optimizing k and d) */
     ZDICT_cover_params_t coverParam;
     memset(&coverParam, 0, sizeof(coverParam));
     coverParam.zParams = zParams;
@@ -388,6 +391,7 @@ int main(int argCount, const char* argv[])
       goto _cleanup;
     }
 
+    /* for cover (with k and d provided) */
     const int coverResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL);
     DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParam.k, coverParam.d, coverParam.steps, (unsigned)(coverParam.splitPoint * 100));
     if(coverResult) {
@@ -398,29 +402,34 @@ int main(int argCount, const char* argv[])
   }
 
   /* for fastCover */
-  for (unsigned f = 15; f < 25; f++){
+  for (f = 15; f < 25; f++){
     DISPLAYLEVEL(2, "current f is %u\n", f);
-    /* for fastCover (optimizing k and d) */
-    ZDICT_fastCover_params_t fastParam;
-    memset(&fastParam, 0, sizeof(fastParam));
-    fastParam.zParams = zParams;
-    fastParam.splitPoint = 1.0;
-    fastParam.f = f;
-    fastParam.steps = 40;
-    fastParam.nbThreads = 1;
-    const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
-    DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
-    if(fastOptResult) {
-      result = 1;
-      goto _cleanup;
-    }
+    for (accel = 1; accel < 11; accel++) {
+      DISPLAYLEVEL(2, "current accel is %u\n", accel);
+      /* for fastCover (optimizing k and d) */
+      ZDICT_fastCover_params_t fastParam;
+      memset(&fastParam, 0, sizeof(fastParam));
+      fastParam.zParams = zParams;
+      fastParam.f = f;
+      fastParam.steps = 40;
+      fastParam.nbThreads = 1;
+      fastParam.accel = accel;
+      const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+      DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100), fastParam.accel);
+      if(fastOptResult) {
+        result = 1;
+        goto _cleanup;
+      }
 
-    /* for fastCover (with k and d provided) */
-    const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
-    DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
-    if(fastResult) {
-      result = 1;
-      goto _cleanup;
+      /* for fastCover (with k and d provided) */
+      for (i = 0; i < 5; i++) {
+        const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+        DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100), fastParam.accel);
+        if(fastResult) {
+          result = 1;
+          goto _cleanup;
+        }
+      }
     }
   }
 
diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c
index 84d841b10..02c155a81 100644
--- a/contrib/experimental_dict_builders/fastCover/fastCover.c
+++ b/contrib/experimental_dict_builders/fastCover/fastCover.c
@@ -197,7 +197,7 @@ static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
     bestSegment.end = newEnd;
   }
   {
-    /* Half the frequency of hash value of each dmer covered by the chosen segment. */
+    /*  Zero the frequency of hash value of each dmer covered by the chosen segment. */
     U32 pos;
     for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
       const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d);
@@ -300,7 +300,7 @@ static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer,
   if (totalSamplesSize < MAX(d, sizeof(U64)) ||
       totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
     DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
-                 (U32)(totalSamplesSize>>20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
+                 (U32)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
     return 0;
   }
   /* Check if there are at least 5 training samples */
diff --git a/contrib/largeNbDicts/.gitignore b/contrib/largeNbDicts/.gitignore
new file mode 100644
index 000000000..e77c4e496
--- /dev/null
+++ b/contrib/largeNbDicts/.gitignore
@@ -0,0 +1,2 @@
+# build artifacts
+largeNbDicts
diff --git a/contrib/largeNbDicts/Makefile b/contrib/largeNbDicts/Makefile
new file mode 100644
index 000000000..624140fab
--- /dev/null
+++ b/contrib/largeNbDicts/Makefile
@@ -0,0 +1,49 @@
+# ################################################################
+# Copyright (c) 2018-present, Yann Collet, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under both the BSD-style license (found in the
+# LICENSE file in the root directory of this source tree) and the GPLv2 (found
+# in the COPYING file in the root directory of this source tree).
+# ################################################################
+
+PROGDIR = ../../programs
+LIBDIR  = ../../lib
+
+LIBZSTD = $(LIBDIR)/libzstd.a
+
+CPPFLAGS+= -I$(LIBDIR) -I$(LIBDIR)/common -I$(LIBDIR)/dictBuilder -I$(PROGDIR)
+
+CFLAGS  ?= -O3
+CFLAGS  += -std=gnu99
+DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
+            -Wstrict-aliasing=1 -Wswitch-enum \
+            -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \
+            -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
+            -Wredundant-decls
+CFLAGS  += $(DEBUGFLAGS) $(MOREFLAGS)
+
+
+default: largeNbDicts
+
+all : largeNbDicts
+
+largeNbDicts: bench.o datagen.o xxhash.o largeNbDicts.c $(LIBZSTD)
+	$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
+
+.PHONY: $(LIBZSTD)
+$(LIBZSTD):
+	$(MAKE) -C $(LIBDIR) libzstd.a
+
+bench.o  : $(PROGDIR)/bench.c
+	$(CC) $(CPPFLAGS) $(CFLAGS) $^ -c
+
+datagen.o: $(PROGDIR)/datagen.c
+	$(CC) $(CPPFLAGS) $(CFLAGS) $^ -c
+
+xxhash.o : $(LIBDIR)/common/xxhash.c
+	$(CC) $(CPPFLAGS) $(CFLAGS) $^ -c
+
+clean:
+	$(RM) *.o
+	$(RM) largeNbDicts
diff --git a/contrib/largeNbDicts/README.md b/contrib/largeNbDicts/README.md
new file mode 100644
index 000000000..f29bcdfe8
--- /dev/null
+++ b/contrib/largeNbDicts/README.md
@@ -0,0 +1,25 @@
+largeNbDicts
+=====================
+
+`largeNbDicts` is a benchmark test tool
+dedicated to the specific scenario of
+dictionary decompression using a very large number of dictionaries.
+When dictionaries are constantly changing, they are always "cold",
+suffering from increased latency due to cache misses.
+
+The tool is created in a bid to investigate performance for this scenario,
+and experiment mitigation techniques.
+
+Command line :
+```
+largeNbDicts [Options] filename(s)
+
+Options :
+-r           : recursively load all files in subdirectories (default: off)
+-B#          : split input into blocks of size # (default: no split)
+-#           : use compression level # (default: 3)
+-D #         : use # as a dictionary (default: create one)
+-i#          : nb benchmark rounds (default: 6)
+--nbDicts=#  : set nb of dictionaries to # (default: one per block)
+-h           : help (this text)
+```
diff --git a/contrib/largeNbDicts/largeNbDicts.c b/contrib/largeNbDicts/largeNbDicts.c
new file mode 100644
index 000000000..e0bc55380
--- /dev/null
+++ b/contrib/largeNbDicts/largeNbDicts.c
@@ -0,0 +1,806 @@
+/*
+ * Copyright (c) 2018-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* largeNbDicts
+ * This is a benchmark test tool
+ * dedicated to the specific case of dictionary decompression
+ * using a very large nb of dictionaries
+ * thus suffering latency from lots of cache misses.
+ * It's created in a bid to investigate performance and find optimizations. */
+
+
+/*---  Dependencies  ---*/
+
+#include <stddef.h>   /* size_t */
+#include <stdlib.h>   /* malloc, free, abort */
+#include <stdio.h>    /* fprintf */
+#include <assert.h>   /* assert */
+
+#include "util.h"
+#include "bench.h"
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+#include "zdict.h"
+
+
+/*---  Constants  --- */
+
+#define KB  *(1<<10)
+#define MB  *(1<<20)
+
+#define BLOCKSIZE_DEFAULT 0  /* no slicing into blocks */
+#define DICTSIZE  (4 KB)
+#define CLEVEL_DEFAULT 3
+
+#define BENCH_TIME_DEFAULT_S   6
+#define RUN_TIME_DEFAULT_MS    1000
+#define BENCH_TIME_DEFAULT_MS (BENCH_TIME_DEFAULT_S * RUN_TIME_DEFAULT_MS)
+
+#define DISPLAY_LEVEL_DEFAULT 3
+
+#define BENCH_SIZE_MAX (1200 MB)
+
+
+/*---  Macros  ---*/
+#define CONTROL(c)   { if (!(c)) abort(); }
+#undef MIN
+#define MIN(a,b)     ((a) < (b) ? (a) : (b))
+
+
+/*---  Display Macros  ---*/
+
+#define DISPLAY(...)         fprintf(stdout, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) { if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } }
+static int g_displayLevel = DISPLAY_LEVEL_DEFAULT;   /* 0 : no display,  1: errors,  2 : + result + interaction + warnings,  3 : + progression,  4 : + information */
+
+
+/*---  buffer_t  ---*/
+
+typedef struct {
+    void* ptr;
+    size_t size;
+    size_t capacity;
+} buffer_t;
+
+static const buffer_t kBuffNull = { NULL, 0, 0 };
+
+/* @return : kBuffNull if any error */
+static buffer_t createBuffer(size_t capacity)
+{
+    assert(capacity > 0);
+    void* const ptr = malloc(capacity);
+    if (ptr==NULL) return kBuffNull;
+
+    buffer_t buffer;
+    buffer.ptr = ptr;
+    buffer.capacity = capacity;
+    buffer.size = 0;
+    return buffer;
+}
+
+static void freeBuffer(buffer_t buff)
+{
+    free(buff.ptr);
+}
+
+
+static void fillBuffer_fromHandle(buffer_t* buff, FILE* f)
+{
+    size_t const readSize = fread(buff->ptr, 1, buff->capacity, f);
+    buff->size = readSize;
+}
+
+
+/* @return : kBuffNull if any error */
+static buffer_t createBuffer_fromFile(const char* fileName)
+{
+    U64 const fileSize = UTIL_getFileSize(fileName);
+    size_t const bufferSize = (size_t) fileSize;
+
+    if (fileSize == UTIL_FILESIZE_UNKNOWN) return kBuffNull;
+    assert((U64)bufferSize == fileSize);   /* check overflow */
+
+    {   FILE* const f = fopen(fileName, "rb");
+        if (f == NULL) return kBuffNull;
+
+        buffer_t buff = createBuffer(bufferSize);
+        CONTROL(buff.ptr != NULL);
+
+        fillBuffer_fromHandle(&buff, f);
+        CONTROL(buff.size == buff.capacity);
+
+        fclose(f);   /* do nothing specific if fclose() fails */
+        return buff;
+    }
+}
+
+
+/* @return : kBuffNull if any error */
+static buffer_t
+createDictionaryBuffer(const char* dictionaryName,
+                       const void* srcBuffer,
+                       const size_t* srcBlockSizes, unsigned nbBlocks,
+                       size_t requestedDictSize)
+{
+    if (dictionaryName) {
+        DISPLAYLEVEL(3, "loading dictionary %s \n", dictionaryName);
+        return createBuffer_fromFile(dictionaryName);  /* note : result might be kBuffNull */
+
+    } else {
+
+        DISPLAYLEVEL(3, "creating dictionary, of target size %u bytes \n",
+                        (unsigned)requestedDictSize);
+        void* const dictBuffer = malloc(requestedDictSize);
+        CONTROL(dictBuffer != NULL);
+
+        size_t const dictSize = ZDICT_trainFromBuffer(dictBuffer, requestedDictSize,
+                                                      srcBuffer,
+                                                      srcBlockSizes, nbBlocks);
+        CONTROL(!ZSTD_isError(dictSize));
+
+        buffer_t result;
+        result.ptr = dictBuffer;
+        result.capacity = requestedDictSize;
+        result.size = dictSize;
+        return result;
+    }
+}
+
+
+/*! BMK_loadFiles() :
+ *  Loads `buffer`, with content from files listed within `fileNamesTable`.
+ *  Fills `buffer` entirely.
+ * @return : 0 on success, !=0 on error */
+static int loadFiles(void* buffer, size_t bufferSize,
+                     size_t* fileSizes,
+                     const char* const * fileNamesTable, unsigned nbFiles)
+{
+    size_t pos = 0, totalSize = 0;
+
+    for (unsigned n=0; n<nbFiles; n++) {
+        U64 fileSize = UTIL_getFileSize(fileNamesTable[n]);
+        if (UTIL_isDirectory(fileNamesTable[n])) {
+            fileSizes[n] = 0;
+            continue;
+        }
+        if (fileSize == UTIL_FILESIZE_UNKNOWN) {
+            fileSizes[n] = 0;
+            continue;
+        }
+
+        FILE* const f = fopen(fileNamesTable[n], "rb");
+        assert(f!=NULL);
+
+        assert(pos <= bufferSize);
+        assert(fileSize <= bufferSize - pos);
+
+        {   size_t const readSize = fread(((char*)buffer)+pos, 1, (size_t)fileSize, f);
+            assert(readSize == fileSize);
+            pos += readSize;
+        }
+        fileSizes[n] = (size_t)fileSize;
+        totalSize += (size_t)fileSize;
+        fclose(f);
+    }
+
+    assert(totalSize == bufferSize);
+    return 0;
+}
+
+
+
+/*---  slice_collection_t  ---*/
+
+typedef struct {
+    void** slicePtrs;
+    size_t* capacities;
+    size_t nbSlices;
+} slice_collection_t;
+
+static const slice_collection_t kNullCollection = { NULL, NULL, 0 };
+
+static void freeSliceCollection(slice_collection_t collection)
+{
+    free(collection.slicePtrs);
+    free(collection.capacities);
+}
+
+/* shrinkSizes() :
+ * downsizes sizes of slices within collection, according to `newSizes`.
+ * every `newSizes` entry must be <= than its corresponding collection size */
+void shrinkSizes(slice_collection_t collection,
+                 const size_t* newSizes)  /* presumed same size as collection */
+{
+    size_t const nbSlices = collection.nbSlices;
+    for (size_t blockNb = 0; blockNb < nbSlices; blockNb++) {
+        assert(newSizes[blockNb] <= collection.capacities[blockNb]);
+        collection.capacities[blockNb] = newSizes[blockNb];
+    }
+}
+
+
+/* splitSlices() :
+ * nbSlices : if == 0, nbSlices is automatically determined from srcSlices and blockSize.
+ *            otherwise, creates exactly nbSlices slices,
+ *            by either truncating input (when smaller)
+ *            or repeating input from beginning */
+static slice_collection_t
+splitSlices(slice_collection_t srcSlices, size_t blockSize, size_t nbSlices)
+{
+    if (blockSize==0) blockSize = (size_t)(-1);   /* means "do not cut" */
+    size_t nbSrcBlocks = 0;
+    for (size_t ssnb=0; ssnb < srcSlices.nbSlices; ssnb++) {
+        size_t pos = 0;
+        while (pos <= srcSlices.capacities[ssnb]) {
+            nbSrcBlocks++;
+            pos += blockSize;
+        }
+    }
+
+    if (nbSlices == 0) nbSlices = nbSrcBlocks;
+
+    void** const sliceTable = (void**)malloc(nbSlices * sizeof(*sliceTable));
+    size_t* const capacities = (size_t*)malloc(nbSlices * sizeof(*capacities));
+    if (sliceTable == NULL || capacities == NULL) {
+        free(sliceTable);
+        free(capacities);
+        return kNullCollection;
+    }
+
+    size_t ssnb = 0;
+    for (size_t sliceNb=0; sliceNb < nbSlices; ) {
+        ssnb = (ssnb + 1) % srcSlices.nbSlices;
+        size_t pos = 0;
+        char* const ptr = (char*)srcSlices.slicePtrs[ssnb];
+        while (pos < srcSlices.capacities[ssnb] && sliceNb < nbSlices) {
+            size_t const size = MIN(blockSize, srcSlices.capacities[ssnb] - pos);
+            sliceTable[sliceNb] = ptr + pos;
+            capacities[sliceNb] = size;
+            sliceNb++;
+            pos += blockSize;
+        }
+    }
+
+    slice_collection_t result;
+    result.nbSlices = nbSlices;
+    result.slicePtrs = sliceTable;
+    result.capacities = capacities;
+    return result;
+}
+
+
+static size_t sliceCollection_totalCapacity(slice_collection_t sc)
+{
+    size_t totalSize = 0;
+    for (size_t n=0; n<sc.nbSlices; n++)
+        totalSize += sc.capacities[n];
+    return totalSize;
+}
+
+
+/* ---  buffer collection  --- */
+
+typedef struct {
+    buffer_t buffer;
+    slice_collection_t slices;
+} buffer_collection_t;
+
+
+static void freeBufferCollection(buffer_collection_t bc)
+{
+    freeBuffer(bc.buffer);
+    freeSliceCollection(bc.slices);
+}
+
+
+static buffer_collection_t
+createBufferCollection_fromSliceCollectionSizes(slice_collection_t sc)
+{
+    size_t const bufferSize = sliceCollection_totalCapacity(sc);
+
+    buffer_t buffer = createBuffer(bufferSize);
+    CONTROL(buffer.ptr != NULL);
+
+    size_t const nbSlices = sc.nbSlices;
+    void** const slices = (void**)malloc(nbSlices * sizeof(*slices));
+    CONTROL(slices != NULL);
+
+    size_t* const capacities = (size_t*)malloc(nbSlices * sizeof(*capacities));
+    CONTROL(capacities != NULL);
+
+    char* const ptr = (char*)buffer.ptr;
+    size_t pos = 0;
+    for (size_t n=0; n < nbSlices; n++) {
+        capacities[n] = sc.capacities[n];
+        slices[n] = ptr + pos;
+        pos += capacities[n];
+    }
+
+    buffer_collection_t result;
+    result.buffer = buffer;
+    result.slices.nbSlices = nbSlices;
+    result.slices.capacities = capacities;
+    result.slices.slicePtrs = slices;
+    return result;
+}
+
+
+/* @return : kBuffNull if any error */
+static buffer_collection_t
+createBufferCollection_fromFiles(const char* const * fileNamesTable, unsigned nbFiles)
+{
+    U64 const totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles);
+    assert(totalSizeToLoad != UTIL_FILESIZE_UNKNOWN);
+    assert(totalSizeToLoad <= BENCH_SIZE_MAX);
+    size_t const loadedSize = (size_t)totalSizeToLoad;
+    assert(loadedSize > 0);
+    void* const srcBuffer = malloc(loadedSize);
+    assert(srcBuffer != NULL);
+
+    assert(nbFiles > 0);
+    size_t* const fileSizes = (size_t*)calloc(nbFiles, sizeof(*fileSizes));
+    assert(fileSizes != NULL);
+
+    /* Load input buffer */
+    int const errorCode = loadFiles(srcBuffer, loadedSize,
+                                    fileSizes,
+                                    fileNamesTable, nbFiles);
+    assert(errorCode == 0);
+
+    void** sliceTable = (void**)malloc(nbFiles * sizeof(*sliceTable));
+    assert(sliceTable != NULL);
+
+    char* const ptr = (char*)srcBuffer;
+    size_t pos = 0;
+    unsigned fileNb = 0;
+    for ( ; (pos < loadedSize) && (fileNb < nbFiles); fileNb++) {
+        sliceTable[fileNb] = ptr + pos;
+        pos += fileSizes[fileNb];
+    }
+    assert(pos == loadedSize);
+    assert(fileNb == nbFiles);
+
+
+    buffer_t buffer;
+    buffer.ptr = srcBuffer;
+    buffer.capacity = loadedSize;
+    buffer.size = loadedSize;
+
+    slice_collection_t slices;
+    slices.slicePtrs = sliceTable;
+    slices.capacities = fileSizes;
+    slices.nbSlices = nbFiles;
+
+    buffer_collection_t bc;
+    bc.buffer = buffer;
+    bc.slices = slices;
+    return bc;
+}
+
+
+
+
+/*---  ddict_collection_t  ---*/
+
+typedef struct {
+    ZSTD_DDict** ddicts;
+    size_t nbDDict;
+} ddict_collection_t;
+
+static const ddict_collection_t kNullDDictCollection = { NULL, 0 };
+
+static void freeDDictCollection(ddict_collection_t ddictc)
+{
+    for (size_t dictNb=0; dictNb < ddictc.nbDDict; dictNb++) {
+        ZSTD_freeDDict(ddictc.ddicts[dictNb]);
+    }
+    free(ddictc.ddicts);
+}
+
+/* returns .buffers=NULL if operation fails */
+static ddict_collection_t createDDictCollection(const void* dictBuffer, size_t dictSize, size_t nbDDict)
+{
+    ZSTD_DDict** const ddicts = malloc(nbDDict * sizeof(ZSTD_DDict*));
+    assert(ddicts != NULL);
+    if (ddicts==NULL) return kNullDDictCollection;
+    for (size_t dictNb=0; dictNb < nbDDict; dictNb++) {
+        ddicts[dictNb] = ZSTD_createDDict(dictBuffer, dictSize);
+        assert(ddicts[dictNb] != NULL);
+    }
+    ddict_collection_t ddictc;
+    ddictc.ddicts = ddicts;
+    ddictc.nbDDict = nbDDict;
+    return ddictc;
+}
+
+
+/* mess with adresses, so that linear scanning dictionaries != linear address scanning */
+void shuffleDictionaries(ddict_collection_t dicts)
+{
+    size_t const nbDicts = dicts.nbDDict;
+    for (size_t r=0; r<nbDicts; r++) {
+        size_t const d = rand() % nbDicts;
+        ZSTD_DDict* tmpd = dicts.ddicts[d];
+        dicts.ddicts[d] = dicts.ddicts[r];
+        dicts.ddicts[r] = tmpd;
+    }
+    for (size_t r=0; r<nbDicts; r++) {
+        size_t const d1 = rand() % nbDicts;
+        size_t const d2 = rand() % nbDicts;
+        ZSTD_DDict* tmpd = dicts.ddicts[d1];
+        dicts.ddicts[d1] = dicts.ddicts[d2];
+        dicts.ddicts[d2] = tmpd;
+    }
+}
+
+
+/* ---   Compression  --- */
+
+/* compressBlocks() :
+ * @return : total compressed size of all blocks,
+ *        or 0 if error.
+ */
+static size_t compressBlocks(size_t* cSizes,   /* optional (can be NULL). If present, must contain at least nbBlocks fields */
+                             slice_collection_t dstBlockBuffers,
+                             slice_collection_t srcBlockBuffers,
+                             ZSTD_CDict* cdict, int cLevel)
+{
+    size_t const nbBlocks = srcBlockBuffers.nbSlices;
+    assert(dstBlockBuffers.nbSlices == srcBlockBuffers.nbSlices);
+
+    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+    assert(cctx != NULL);
+
+    size_t totalCSize = 0;
+    for (size_t blockNb=0; blockNb < nbBlocks; blockNb++) {
+        size_t cBlockSize;
+        if (cdict == NULL) {
+            cBlockSize = ZSTD_compressCCtx(cctx,
+                            dstBlockBuffers.slicePtrs[blockNb], dstBlockBuffers.capacities[blockNb],
+                            srcBlockBuffers.slicePtrs[blockNb], srcBlockBuffers.capacities[blockNb],
+                            cLevel);
+        } else {
+            cBlockSize = ZSTD_compress_usingCDict(cctx,
+                            dstBlockBuffers.slicePtrs[blockNb], dstBlockBuffers.capacities[blockNb],
+                            srcBlockBuffers.slicePtrs[blockNb], srcBlockBuffers.capacities[blockNb],
+                            cdict);
+        }
+        CONTROL(!ZSTD_isError(cBlockSize));
+        if (cSizes) cSizes[blockNb] = cBlockSize;
+        totalCSize += cBlockSize;
+    }
+    return totalCSize;
+}
+
+
+/* ---  Benchmark  --- */
+
+typedef struct {
+    ZSTD_DCtx* dctx;
+    size_t nbDicts;
+    size_t dictNb;
+    ddict_collection_t dictionaries;
+} decompressInstructions;
+
+decompressInstructions createDecompressInstructions(ddict_collection_t dictionaries)
+{
+    decompressInstructions di;
+    di.dctx = ZSTD_createDCtx();
+    assert(di.dctx != NULL);
+    di.nbDicts = dictionaries.nbDDict;
+    di.dictNb = 0;
+    di.dictionaries = dictionaries;
+    return di;
+}
+
+void freeDecompressInstructions(decompressInstructions di)
+{
+    ZSTD_freeDCtx(di.dctx);
+}
+
+/* benched function */
+size_t decompress(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* payload)
+{
+    decompressInstructions* const di = (decompressInstructions*) payload;
+
+    size_t const result = ZSTD_decompress_usingDDict(di->dctx,
+                                        dst, dstCapacity,
+                                        src, srcSize,
+                                        di->dictionaries.ddicts[di->dictNb]);
+
+    di->dictNb = di->dictNb + 1;
+    if (di->dictNb >= di->nbDicts) di->dictNb = 0;
+
+    return result;
+}
+
+
+static int benchMem(slice_collection_t dstBlocks,
+                    slice_collection_t srcBlocks,
+                    ddict_collection_t dictionaries,
+                    int nbRounds)
+{
+    assert(dstBlocks.nbSlices == srcBlocks.nbSlices);
+
+    unsigned const ms_per_round = RUN_TIME_DEFAULT_MS;
+    unsigned const total_time_ms = nbRounds * ms_per_round;
+
+    double bestSpeed = 0.;
+
+    BMK_timedFnState_t* const benchState =
+            BMK_createTimedFnState(total_time_ms, ms_per_round);
+    decompressInstructions di = createDecompressInstructions(dictionaries);
+
+    for (;;) {
+        BMK_runOutcome_t const outcome = BMK_benchTimedFn(benchState,
+                                decompress, &di,
+                                NULL, NULL,
+                                dstBlocks.nbSlices,
+                                (const void* const *)srcBlocks.slicePtrs, srcBlocks.capacities,
+                                dstBlocks.slicePtrs, dstBlocks.capacities,
+                                NULL);
+        CONTROL(BMK_isSuccessful_runOutcome(outcome));
+
+        BMK_runTime_t const result = BMK_extract_runTime(outcome);
+        U64 const dTime_ns = result.nanoSecPerRun;
+        double const dTime_sec = (double)dTime_ns / 1000000000;
+        size_t const srcSize = result.sumOfReturn;
+        double const dSpeed_MBps = (double)srcSize / dTime_sec / (1 MB);
+        if (dSpeed_MBps > bestSpeed) bestSpeed = dSpeed_MBps;
+        DISPLAY("Decompression Speed : %.1f MB/s \r", bestSpeed);
+        fflush(stdout);
+        if (BMK_isCompleted_TimedFn(benchState)) break;
+    }
+    DISPLAY("\n");
+
+    freeDecompressInstructions(di);
+    BMK_freeTimedFnState(benchState);
+
+    return 0;   /* success */
+}
+
+
+/*! bench() :
+ *  fileName : file to load for benchmarking purpose
+ *  dictionary : optional (can be NULL), file to load as dictionary,
+ *              if none provided : will be calculated on the fly by the program.
+ * @return : 0 is success, 1+ otherwise */
+int bench(const char** fileNameTable, unsigned nbFiles,
+          const char* dictionary,
+          size_t blockSize, int clevel,
+          unsigned nbDictMax, unsigned nbBlocks,
+          int nbRounds)
+{
+    int result = 0;
+
+    DISPLAYLEVEL(3, "loading %u files... \n", nbFiles);
+    buffer_collection_t const srcs = createBufferCollection_fromFiles(fileNameTable, nbFiles);
+    CONTROL(srcs.buffer.ptr != NULL);
+    buffer_t srcBuffer = srcs.buffer;
+    size_t const srcSize = srcBuffer.size;
+    DISPLAYLEVEL(3, "created src buffer of size %.1f MB \n",
+                    (double)srcSize / (1 MB));
+
+    slice_collection_t const srcSlices = splitSlices(srcs.slices, blockSize, nbBlocks);
+    nbBlocks = (unsigned)(srcSlices.nbSlices);
+    DISPLAYLEVEL(3, "split input into %u blocks ", nbBlocks);
+    if (blockSize)
+        DISPLAYLEVEL(3, "of max size %u bytes ", (unsigned)blockSize);
+    DISPLAYLEVEL(3, "\n");
+
+
+    size_t* const dstCapacities = malloc(nbBlocks * sizeof(*dstCapacities));
+    CONTROL(dstCapacities != NULL);
+    size_t dstBufferCapacity = 0;
+    for (size_t bnb=0; bnb<nbBlocks; bnb++) {
+        dstCapacities[bnb] = ZSTD_compressBound(srcSlices.capacities[bnb]);
+        dstBufferCapacity += dstCapacities[bnb];
+    }
+
+    buffer_t dstBuffer = createBuffer(dstBufferCapacity);
+    CONTROL(dstBuffer.ptr != NULL);
+
+    void** const sliceTable = malloc(nbBlocks * sizeof(*sliceTable));
+    CONTROL(sliceTable != NULL);
+
+    {   char* const ptr = dstBuffer.ptr;
+        size_t pos = 0;
+        for (size_t snb=0; snb < nbBlocks; snb++) {
+            sliceTable[snb] = ptr + pos;
+            pos += dstCapacities[snb];
+    }   }
+
+    slice_collection_t dstSlices;
+    dstSlices.capacities = dstCapacities;
+    dstSlices.slicePtrs = sliceTable;
+    dstSlices.nbSlices = nbBlocks;
+
+
+    /* dictionary determination */
+    buffer_t const dictBuffer = createDictionaryBuffer(dictionary,
+                                srcBuffer.ptr,
+                                srcSlices.capacities, nbBlocks,
+                                DICTSIZE);
+    CONTROL(dictBuffer.ptr != NULL);
+
+    ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuffer.ptr, dictBuffer.size, clevel);
+    CONTROL(cdict != NULL);
+
+    size_t const cTotalSizeNoDict = compressBlocks(NULL, dstSlices, srcSlices, NULL, clevel);
+    CONTROL(cTotalSizeNoDict != 0);
+    DISPLAYLEVEL(3, "compressing at level %u without dictionary : Ratio=%.2f  (%u bytes) \n",
+                    clevel,
+                    (double)srcSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
+
+    size_t* const cSizes = malloc(nbBlocks * sizeof(size_t));
+    CONTROL(cSizes != NULL);
+
+    size_t const cTotalSize = compressBlocks(cSizes, dstSlices, srcSlices, cdict, clevel);
+    CONTROL(cTotalSize != 0);
+    DISPLAYLEVEL(3, "compressed using a %u bytes dictionary : Ratio=%.2f  (%u bytes) \n",
+                    (unsigned)dictBuffer.size,
+                    (double)srcSize / cTotalSize, (unsigned)cTotalSize);
+
+    /* now dstSlices contain the real compressed size of each block, instead of the maximum capacity */
+    shrinkSizes(dstSlices, cSizes);
+
+    size_t const dictMem = ZSTD_estimateDDictSize(dictBuffer.size, ZSTD_dlm_byCopy);
+    unsigned const nbDicts = nbDictMax ? nbDictMax : nbBlocks;
+    size_t const allDictMem = dictMem * nbDicts;
+    DISPLAYLEVEL(3, "generating %u dictionaries, using %.1f MB of memory \n",
+                    nbDicts, (double)allDictMem / (1 MB));
+
+    ddict_collection_t const dictionaries = createDDictCollection(dictBuffer.ptr, dictBuffer.size, nbDicts);
+    CONTROL(dictionaries.ddicts != NULL);
+
+    shuffleDictionaries(dictionaries);
+
+    buffer_collection_t resultCollection = createBufferCollection_fromSliceCollectionSizes(srcSlices);
+    CONTROL(resultCollection.buffer.ptr != NULL);
+
+    result = benchMem(resultCollection.slices, dstSlices, dictionaries, nbRounds);
+
+    /* free all heap objects in reverse order */
+    freeBufferCollection(resultCollection);
+    freeDDictCollection(dictionaries);
+    free(cSizes);
+    ZSTD_freeCDict(cdict);
+    freeBuffer(dictBuffer);
+    freeSliceCollection(dstSlices);
+    freeBuffer(dstBuffer);
+    freeSliceCollection(srcSlices);
+    freeBufferCollection(srcs);
+
+    return result;
+}
+
+
+
+/* ---  Command Line  --- */
+
+/*! readU32FromChar() :
+ * @return : unsigned integer value read from input in `char` format.
+ *  allows and interprets K, KB, KiB, M, MB and MiB suffix.
+ *  Will also modify `*stringPtr`, advancing it to position where it stopped reading.
+ *  Note : function will exit() program if digit sequence overflows */
+static unsigned readU32FromChar(const char** stringPtr)
+{
+    unsigned result = 0;
+    while ((**stringPtr >='0') && (**stringPtr <='9')) {
+        unsigned const max = (((unsigned)(-1)) / 10) - 1;
+        assert(result <= max);   /* check overflow */
+        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
+    }
+    if ((**stringPtr=='K') || (**stringPtr=='M')) {
+        unsigned const maxK = ((unsigned)(-1)) >> 10;
+        assert(result <= maxK);   /* check overflow */
+        result <<= 10;
+        if (**stringPtr=='M') {
+            assert(result <= maxK);   /* check overflow */
+            result <<= 10;
+        }
+        (*stringPtr)++;  /* skip `K` or `M` */
+        if (**stringPtr=='i') (*stringPtr)++;
+        if (**stringPtr=='B') (*stringPtr)++;
+    }
+    return result;
+}
+
+/** longCommandWArg() :
+ *  check if *stringPtr is the same as longCommand.
+ *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
+ * @return 0 and doesn't modify *stringPtr otherwise.
+ */
+static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
+{
+    size_t const comSize = strlen(longCommand);
+    int const result = !strncmp(*stringPtr, longCommand, comSize);
+    if (result) *stringPtr += comSize;
+    return result;
+}
+
+
+int usage(const char* exeName)
+{
+    DISPLAY (" \n");
+    DISPLAY (" %s [Options] filename(s) \n", exeName);
+    DISPLAY (" \n");
+    DISPLAY ("Options : \n");
+    DISPLAY ("-r          : recursively load all files in subdirectories (default: off) \n");
+    DISPLAY ("-B#         : split input into blocks of size # (default: no split) \n");
+    DISPLAY ("-#          : use compression level # (default: %u) \n", CLEVEL_DEFAULT);
+    DISPLAY ("-D #        : use # as a dictionary (default: create one) \n");
+    DISPLAY ("-i#         : nb benchmark rounds (default: %u) \n", BENCH_TIME_DEFAULT_S);
+    DISPLAY ("--nbBlocks=#: use # blocks for bench (default: one per file) \n");
+    DISPLAY ("--nbDicts=# : create # dictionaries for bench (default: one per block) \n");
+    DISPLAY ("-h          : help (this text) \n");
+    return 0;
+}
+
+int bad_usage(const char* exeName)
+{
+    DISPLAY (" bad usage : \n");
+    usage(exeName);
+    return 1;
+}
+
+int main (int argc, const char** argv)
+{
+    int recursiveMode = 0;
+    int nbRounds = BENCH_TIME_DEFAULT_S;
+    const char* const exeName = argv[0];
+
+    if (argc < 2) return bad_usage(exeName);
+
+    const char** nameTable = (const char**)malloc(argc * sizeof(const char*));
+    assert(nameTable != NULL);
+    unsigned nameIdx = 0;
+
+    const char* dictionary = NULL;
+    int cLevel = CLEVEL_DEFAULT;
+    size_t blockSize = BLOCKSIZE_DEFAULT;
+    unsigned nbDicts = 0;  /* determine nbDicts automatically: 1 dictionary per block */
+    unsigned nbBlocks = 0; /* determine nbBlocks automatically, from source and blockSize */
+
+    for (int argNb = 1; argNb < argc ; argNb++) {
+        const char* argument = argv[argNb];
+        if (!strcmp(argument, "-h")) { free(nameTable); return usage(exeName); }
+        if (!strcmp(argument, "-r")) { recursiveMode = 1; continue; }
+        if (!strcmp(argument, "-D")) { argNb++; assert(argNb < argc); dictionary = argv[argNb]; continue; }
+        if (longCommandWArg(&argument, "-i")) { nbRounds = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--dictionary=")) { dictionary = argument; continue; }
+        if (longCommandWArg(&argument, "-B")) { blockSize = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--blockSize=")) { blockSize = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--nbDicts=")) { nbDicts = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--nbBlocks=")) { nbBlocks = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "--clevel=")) { cLevel = readU32FromChar(&argument); continue; }
+        if (longCommandWArg(&argument, "-")) { cLevel = readU32FromChar(&argument); continue; }
+        /* anything that's not a command is a filename */
+        nameTable[nameIdx++] = argument;
+    }
+
+    const char** filenameTable = nameTable;
+    unsigned nbFiles = nameIdx;
+    char* buffer_containing_filenames = NULL;
+
+    if (recursiveMode) {
+#ifndef UTIL_HAS_CREATEFILELIST
+        assert(0);   /* missing capability, do not run */
+#endif
+        filenameTable = UTIL_createFileList(nameTable, nameIdx, &buffer_containing_filenames, &nbFiles, 1 /* follow_links */);
+    }
+
+    int result = bench(filenameTable, nbFiles, dictionary, blockSize, cLevel, nbDicts, nbBlocks, nbRounds);
+
+    free(buffer_containing_filenames);
+    free(nameTable);
+
+    return result;
+}
diff --git a/contrib/seekable_format/examples/seekable_compression.c b/contrib/seekable_format/examples/seekable_compression.c
index 9485bf26f..9a331a895 100644
--- a/contrib/seekable_format/examples/seekable_compression.c
+++ b/contrib/seekable_format/examples/seekable_compression.c
@@ -101,7 +101,7 @@ static void compressFile_orDie(const char* fname, const char* outName, int cLeve
     free(buffOut);
 }
 
-static const char* createOutFilename_orDie(const char* filename)
+static char* createOutFilename_orDie(const char* filename)
 {
     size_t const inL = strlen(filename);
     size_t const outL = inL + 5;
@@ -109,7 +109,7 @@ static const char* createOutFilename_orDie(const char* filename)
     memset(outSpace, 0, outL);
     strcat(outSpace, filename);
     strcat(outSpace, ".zst");
-    return (const char*)outSpace;
+    return (char*)outSpace;
 }
 
 int main(int argc, const char** argv) {
@@ -124,8 +124,9 @@ int main(int argc, const char** argv) {
     {   const char* const inFileName = argv[1];
         unsigned const frameSize = (unsigned)atoi(argv[2]);
 
-        const char* const outFileName = createOutFilename_orDie(inFileName);
+        char* const outFileName = createOutFilename_orDie(inFileName);
         compressFile_orDie(inFileName, outFileName, 5, frameSize);
+        free(outFileName);
     }
 
     return 0;
diff --git a/contrib/seekable_format/examples/seekable_decompression.c b/contrib/seekable_format/examples/seekable_decompression.c
index 9cd232922..7050e0fa5 100644
--- a/contrib/seekable_format/examples/seekable_decompression.c
+++ b/contrib/seekable_format/examples/seekable_decompression.c
@@ -84,7 +84,7 @@ static void fseek_orDie(FILE* file, long int offset, int origin) {
 }
 
 
-static void decompressFile_orDie(const char* fname, unsigned startOffset, unsigned endOffset)
+static void decompressFile_orDie(const char* fname, off_t startOffset, off_t endOffset)
 {
     FILE* const fin  = fopen_orDie(fname, "rb");
     FILE* const fout = stdout;
@@ -129,8 +129,8 @@ int main(int argc, const char** argv)
 
     {
         const char* const inFilename = argv[1];
-        unsigned const startOffset = (unsigned) atoi(argv[2]);
-        unsigned const endOffset = (unsigned) atoi(argv[3]);
+        off_t const startOffset = atoll(argv[2]);
+        off_t const endOffset = atoll(argv[3]);
         decompressFile_orDie(inFilename, startOffset, endOffset);
     }
 
diff --git a/contrib/seekable_format/zstdseek_decompress.c b/contrib/seekable_format/zstdseek_decompress.c
index b006ff834..b4c48754e 100644
--- a/contrib/seekable_format/zstdseek_decompress.c
+++ b/contrib/seekable_format/zstdseek_decompress.c
@@ -56,6 +56,7 @@
 
 #include <stdlib.h> /* malloc, free */
 #include <stdio.h>  /* FILE* */
+#include <assert.h>
 
 #define XXH_STATIC_LINKING_ONLY
 #define XXH_NAMESPACE ZSTD_
@@ -112,7 +113,7 @@ static int ZSTD_seekable_read_buff(void* opaque, void* buffer, size_t n)
 
 static int ZSTD_seekable_seek_buff(void* opaque, long long offset, int origin)
 {
-    buffWrapper_t* buff = (buffWrapper_t*) opaque;
+    buffWrapper_t* const buff = (buffWrapper_t*) opaque;
     unsigned long long newOffset;
     switch (origin) {
     case SEEK_SET:
@@ -124,6 +125,8 @@ static int ZSTD_seekable_seek_buff(void* opaque, long long offset, int origin)
     case SEEK_END:
         newOffset = (unsigned long long)buff->size - offset;
         break;
+    default:
+        assert(0);  /* not possible */
     }
     if (newOffset > buff->size) {
         return -1;
@@ -310,8 +313,8 @@ static size_t ZSTD_seekable_loadSeekTable(ZSTD_seekable* zs)
             /* compute cumulative positions */
             for (; idx < numFrames; idx++) {
                 if (pos + sizePerEntry > SEEKABLE_BUFF_SIZE) {
-                    U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE);
                     U32 const offset = SEEKABLE_BUFF_SIZE - pos;
+                    U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE - offset);
                     memmove(zs->inBuff, zs->inBuff + pos, offset); /* move any data we haven't read yet */
                     CHECK_IO(src.read(src.opaque, zs->inBuff+offset, toRead));
                     remaining -= toRead;
diff --git a/doc/zstd_compression_format.md b/doc/zstd_compression_format.md
index 0b79f959f..c57b58269 100644
--- a/doc/zstd_compression_format.md
+++ b/doc/zstd_compression_format.md
@@ -16,7 +16,7 @@ Distribution of this document is unlimited.
 
 ### Version
 
-0.2.8 (30/05/18)
+0.2.9 (05/09/18)
 
 
 Introduction
@@ -1192,6 +1192,8 @@ Number_of_Bits = Weight ? (Max_Number_of_Bits + 1 - Weight) : 0
 The last symbol's `Weight` is deduced from previously decoded ones,
 by completing to the nearest power of 2.
 This power of 2 gives `Max_Number_of_Bits`, the depth of the current tree.
+`Max_Number_of_Bits` must be <= 11,
+otherwise the representation is considered corrupted.
 
 __Example__ :
 Let's presume the following Huffman tree must be described :
@@ -1216,12 +1218,12 @@ It gives the following series of weights :
 |   `Weight`    |  4  |  3  |  2  |  0  |  1  |
 
 The decoder will do the inverse operation :
-having collected weights of literals from `0` to `4`,
+having collected weights of literal symbols from `0` to `4`,
 it knows the last literal, `5`, is present with a non-zero weight.
 The weight of `5` can be determined by advancing to the next power of 2.
 The sum of `2^(Weight-1)` (excluding 0's) is :
 `8 + 4 + 2 + 0 + 1 = 15`.
-Nearest power of 2 is 16.
+Nearest larger power of 2 value is 16.
 Therefore, `Max_Number_of_Bits = 4` and `Weight[5] = 16-15 = 1`.
 
 #### Huffman Tree header
@@ -1233,18 +1235,24 @@ which describes how the series of weights is encoded.
   the series of weights is compressed using FSE (see below).
   The length of the FSE-compressed series is equal to `headerByte` (0-127).
 
-- if `headerByte` >= 128 : this is a direct representation,
-  where each `Weight` is written directly as a 4 bits field (0-15).
-  They are encoded forward, 2 weights to a byte with the first weight taking
-  the top four bits and the second taking the bottom four (e.g. the following
-  operations could be used to read the weights:
-  `Weight[0] = (Byte[0] >> 4), Weight[1] = (Byte[0] & 0xf)`, etc.).
-  The full representation occupies `Ceiling(Number_of_Symbols/2)` bytes,
-  meaning it uses only full bytes even if `Number_of_Symbols` is odd.
-  `Number_of_Symbols = headerByte - 127`.
-  Note that maximum `Number_of_Symbols` is 255-127 = 128.
-  If any literal has a value > 128, raw header mode is not possible.
-  In such case, it's necessary to use FSE compression.
+- if `headerByte` >= 128 :
+  + the series of weights uses a direct representation,
+    where each `Weight` is encoded directly as a 4 bits field (0-15).
+  + They are encoded forward, 2 weights to a byte,
+    first weight taking the top four bits and second one taking the bottom four.
+    * e.g. the following operations could be used to read the weights:
+      `Weight[0] = (Byte[0] >> 4), Weight[1] = (Byte[0] & 0xf)`, etc.
+  + The full representation occupies `Ceiling(Number_of_Weights/2)` bytes,
+    meaning it uses only full bytes even if `Number_of_Weights` is odd.
+  + `Number_of_Weights = headerByte - 127`.
+    * Note that maximum `Number_of_Weights` is 255-127 = 128,
+      therefore, only up to 128 `Weight` can be encoded using direct representation.
+    * Since the last non-zero `Weight` is _not_ encoded,
+      this scheme is compatible with alphabet sizes of up to 129 symbols,
+      hence including literal symbol 128.
+    * If any literal symbol > 128 has a non-zero `Weight`,
+      direct representation is not possible.
+      In such case, it's necessary to use FSE compression.
 
 
 #### Finite State Entropy (FSE) compression of Huffman weights
@@ -1621,6 +1629,7 @@ or at least provide a meaningful error code explaining for which reason it canno
 
 Version changes
 ---------------
+- 0.2.9 : clarifications for huffman weights direct representation, by Ulrich Kunitz
 - 0.2.8 : clarifications for IETF RFC discuss
 - 0.2.7 : clarifications from IETF RFC review, by Vijay Gurbani and Nick Terrell
 - 0.2.6 : fixed an error in huffman example, by Ulrich Kunitz
diff --git a/doc/zstd_manual.html b/doc/zstd_manual.html
index 48b9ca7f2..e5aade5b2 100644
--- a/doc/zstd_manual.html
+++ b/doc/zstd_manual.html
@@ -35,22 +35,34 @@
 </ol>
 <hr>
 <a name="Chapter1"></a><h2>Introduction</h2><pre>
-  zstd, short for Zstandard, is a fast lossless compression algorithm,
-  targeting real-time compression scenarios at zlib-level and better compression ratios.
-  The zstd compression library provides in-memory compression and decompression functions.
-  The library supports compression levels from 1 up to ZSTD_maxCLevel() which is currently 22.
-  Levels >= 20, labeled `--ultra`, should be used with caution, as they require more memory.
+  zstd, short for Zstandard, is a fast lossless compression algorithm, targeting
+  real-time compression scenarios at zlib-level and better compression ratios.
+  The zstd compression library provides in-memory compression and decompression
+  functions.
+
+  The library supports regular compression levels from 1 up to ZSTD_maxCLevel(),
+  which is currently 22. Levels >= 20, labeled `--ultra`, should be used with
+  caution, as they require more memory. The library also offers negative
+  compression levels, which extend the range of speed vs. ratio preferences.
+  The lower the level, the faster the speed (at the cost of compression).
+
   Compression can be done in:
     - a single step (described as Simple API)
     - a single step, reusing a context (described as Explicit context)
     - unbounded multiple steps (described as Streaming compression)
-  The compression ratio achievable on small data can be highly improved using a dictionary in:
-    - a single step (described as Simple dictionary API)
-    - a single step, reusing a dictionary (described as Bulk-processing dictionary API)
 
-  Advanced experimental functions can be accessed using #define ZSTD_STATIC_LINKING_ONLY before including zstd.h.
-  Advanced experimental APIs shall never be used with a dynamic library.
-  They are not "stable", their definition may change in the future. Only static linking is allowed.
+  The compression ratio achievable on small data can be highly improved using
+  a dictionary. Dictionary compression can be performed in:
+    - a single step (described as Simple dictionary API)
+    - a single step, reusing a dictionary (described as Bulk-processing
+      dictionary API)
+
+  Advanced experimental functions can be accessed using
+  `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h.
+
+  Advanced experimental APIs should never be used with a dynamically-linked
+  library. They are not "stable"; their definitions or signatures may change in
+  the future. Only static linking is allowed.
 <BR></pre>
 
 <a name="Chapter2"></a><h2>Version</h2><pre></pre>
@@ -181,7 +193,8 @@ size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
 </b><p>  When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once.
   ZSTD_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
   ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
-  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict 
+  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict
+  Note : A ZSTD_CDict can be created with an empty dictionary, but it is inefficient for small data. 
 </p></pre><BR>
 
 <pre><b>size_t      ZSTD_freeCDict(ZSTD_CDict* CDict);
@@ -195,7 +208,9 @@ size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
 </b><p>  Compression using a digested Dictionary.
   Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
   Note that compression level is decided during dictionary creation.
-  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) 
+  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no)
+  Note : ZSTD_compress_usingCDict() can be used with a ZSTD_CDict created from an empty dictionary.
+         But it is inefficient for small data, and it is recommended to use ZSTD_compressCCtx(). 
 </p></pre><BR>
 
 <pre><b>ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
@@ -965,16 +980,21 @@ size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx,
                            const void* prefix, size_t prefixSize,
                            ZSTD_dictContentType_e dictContentType);
 </b><p>  Reference a prefix (single-usage dictionary) for next compression job.
-  Decompression need same prefix to properly regenerate data.
-  Prefix is **only used once**. Tables are discarded at end of compression job (ZSTD_e_end).
+  Decompression will need same prefix to properly regenerate data.
+  Compressing with a prefix is similar in outcome as performing a diff and compressing it,
+  but performs much faster, especially during decompression (compression speed is tunable with compression level).
+  Note that prefix is **only used once**. Tables are discarded at end of compression job (ZSTD_e_end).
  @result : 0, or an error code (which can be tested with ZSTD_isError()).
   Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
   Note 1 : Prefix buffer is referenced. It **must** outlive compression job.
            Its contain must remain unmodified up to end of compression (ZSTD_e_end).
-  Note 2 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+  Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+           ensure that the window size is large enough to contain the entire source.
+           See ZSTD_p_windowLog.
+  Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
            It's a CPU consuming operation, with non-negligible impact on latency.
            If there is a need to use same prefix multiple times, consider loadDictionary instead.
-  Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
+  Note 4 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
            Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode. 
 </p></pre><BR>
 
@@ -1141,6 +1161,8 @@ size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx,
                         const void* prefix, size_t prefixSize,
                         ZSTD_dictContentType_e dictContentType);
 </b><p>  Reference a prefix (single-usage dictionary) for next compression job.
+  This is the reverse operation of ZSTD_CCtx_refPrefix(),
+  and must use the same prefix as the one used during compression.
   Prefix is **only used once**. Reference is discarded at end of frame.
   End of frame is reached when ZSTD_DCtx_decompress_generic() returns 0.
  @result : 0, or an error code (which can be tested with ZSTD_isError()).
diff --git a/examples/multiple_streaming_compression.c b/examples/multiple_streaming_compression.c
index e395aefbc..4308a2e4d 100644
--- a/examples/multiple_streaming_compression.c
+++ b/examples/multiple_streaming_compression.c
@@ -158,7 +158,8 @@ int main(int argc, const char** argv)
     }
 
     freeResources(ress);
-    /* success */
+    free(ofnBuffer);
+
     printf("compressed %i files \n", argc-1);
 
     return 0;
diff --git a/examples/streaming_compression.c b/examples/streaming_compression.c
index f76364d84..9287ff398 100644
--- a/examples/streaming_compression.c
+++ b/examples/streaming_compression.c
@@ -73,7 +73,11 @@ static void compressFile_orDie(const char* fname, const char* outName, int cLeve
     ZSTD_CStream* const cstream = ZSTD_createCStream();
     if (cstream==NULL) { fprintf(stderr, "ZSTD_createCStream() error \n"); exit(10); }
     size_t const initResult = ZSTD_initCStream(cstream, cLevel);
-    if (ZSTD_isError(initResult)) { fprintf(stderr, "ZSTD_initCStream() error : %s \n", ZSTD_getErrorName(initResult)); exit(11); }
+    if (ZSTD_isError(initResult)) {
+        fprintf(stderr, "ZSTD_initCStream() error : %s \n",
+                    ZSTD_getErrorName(initResult));
+        exit(11);
+    }
 
     size_t read, toRead = buffInSize;
     while( (read = fread_orDie(buffIn, toRead, fin)) ) {
@@ -81,7 +85,11 @@ static void compressFile_orDie(const char* fname, const char* outName, int cLeve
         while (input.pos < input.size) {
             ZSTD_outBuffer output = { buffOut, buffOutSize, 0 };
             toRead = ZSTD_compressStream(cstream, &output , &input);   /* toRead is guaranteed to be <= ZSTD_CStreamInSize() */
-            if (ZSTD_isError(toRead)) { fprintf(stderr, "ZSTD_compressStream() error : %s \n", ZSTD_getErrorName(toRead)); exit(12); }
+            if (ZSTD_isError(toRead)) {
+                fprintf(stderr, "ZSTD_compressStream() error : %s \n",
+                                ZSTD_getErrorName(toRead));
+                exit(12);
+            }
             if (toRead > buffInSize) toRead = buffInSize;   /* Safely handle case when `buffInSize` is manually changed to a value < ZSTD_CStreamInSize()*/
             fwrite_orDie(buffOut, output.pos, fout);
         }
@@ -100,15 +108,15 @@ static void compressFile_orDie(const char* fname, const char* outName, int cLeve
 }
 
 
-static const char* createOutFilename_orDie(const char* filename)
+static char* createOutFilename_orDie(const char* filename)
 {
     size_t const inL = strlen(filename);
     size_t const outL = inL + 5;
-    void* outSpace = malloc_orDie(outL);
+    void* const outSpace = malloc_orDie(outL);
     memset(outSpace, 0, outL);
     strcat(outSpace, filename);
     strcat(outSpace, ".zst");
-    return (const char*)outSpace;
+    return (char*)outSpace;
 }
 
 int main(int argc, const char** argv)
@@ -124,8 +132,10 @@ int main(int argc, const char** argv)
 
     const char* const inFilename = argv[1];
 
-    const char* const outFilename = createOutFilename_orDie(inFilename);
+    char* const outFilename = createOutFilename_orDie(inFilename);
     compressFile_orDie(inFilename, outFilename, 1);
 
+    free(outFilename);   /* not strictly required, since program execution stops there,
+                          * but some static analyzer main complain otherwise */
     return 0;
 }
diff --git a/lib/BUCK b/lib/BUCK
index dbe8885fc..bd93b082a 100644
--- a/lib/BUCK
+++ b/lib/BUCK
@@ -69,6 +69,7 @@ cxx_library(
     ]),
     headers=subdir_glob([
         ('dictBuilder', 'divsufsort.h'),
+        ('dictBuilder', 'cover.h'),
     ]),
     srcs=glob(['dictBuilder/*.c']),
     deps=[':common'],
diff --git a/lib/Makefile b/lib/Makefile
index 01689c6d5..cf8e45b0f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -23,7 +23,7 @@ ifeq ($(OS),Windows_NT)   # MinGW assumed
 CPPFLAGS   += -D__USE_MINGW_ANSI_STDIO   # compatibility with %zu formatting
 endif
 CFLAGS  ?= -O3
-DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
+DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
             -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
             -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \
             -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
diff --git a/lib/common/compiler.h b/lib/common/compiler.h
index 595bac204..07f875e4d 100644
--- a/lib/common/compiler.h
+++ b/lib/common/compiler.h
@@ -89,20 +89,37 @@
 #endif
 
 /* prefetch
- * can be disabled, by declaring NO_PREFETCH macro */
+ * can be disabled, by declaring NO_PREFETCH macro
+ * All prefetch invocations use a single default locality 2,
+ * generating instruction prefetcht1,
+ * which, according to Intel, means "load data into L2 cache".
+ * This is a good enough "middle ground" for the time being,
+ * though in theory, it would be better to specialize locality depending on data being prefetched.
+ * Tests could not determine any sensible difference based on locality value. */
 #if defined(NO_PREFETCH)
-#  define PREFETCH(ptr)   /* disabled */
+#  define PREFETCH(ptr)     (void)(ptr)  /* disabled */
 #else
 #  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
 #    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
-#    define PREFETCH(ptr)   _mm_prefetch((const char*)ptr, _MM_HINT_T0)
+#    define PREFETCH(ptr)   _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
 #  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
-#    define PREFETCH(ptr)   __builtin_prefetch(ptr, 0, 0)
+#    define PREFETCH(ptr)   __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
 #  else
-#    define PREFETCH(ptr)   /* disabled */
+#    define PREFETCH(ptr)   (void)(ptr)  /* disabled */
 #  endif
 #endif  /* NO_PREFETCH */
 
+#define CACHELINE_SIZE 64
+
+#define PREFETCH_AREA(p, s)  {            \
+    const char* const _ptr = (const char*)(p);  \
+    size_t const _size = (size_t)(s);     \
+    size_t _pos;                          \
+    for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) {  \
+        PREFETCH(_ptr + _pos);            \
+    }                                     \
+}
+
 /* disable warnings */
 #ifdef _MSC_VER    /* Visual Studio */
 #  include <intrin.h>                    /* For Visual 2005 */
diff --git a/lib/common/cpu.h b/lib/common/cpu.h
index 88e0ebf44..eeb428ad5 100644
--- a/lib/common/cpu.h
+++ b/lib/common/cpu.h
@@ -36,7 +36,7 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
     U32 f1d = 0;
     U32 f7b = 0;
     U32 f7c = 0;
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
     int reg[4];
     __cpuid((int*)reg, 0);
     {
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index b4c1af53f..e75adfa61 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -79,8 +79,7 @@ static const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 };
 static const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 };
 static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 };
 
-#define ZSTD_FRAMEIDSIZE 4
-static const size_t ZSTD_frameIdSize = ZSTD_FRAMEIDSIZE;  /* magic number size */
+#define ZSTD_FRAMEIDSIZE 4   /* magic number size */
 
 #define ZSTD_BLOCKHEADERSIZE 3   /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
 static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
@@ -193,6 +192,8 @@ typedef struct {
     BYTE* llCode;
     BYTE* mlCode;
     BYTE* ofCode;
+    size_t maxNbSeq;
+    size_t maxNbLit;
     U32   longLengthID;   /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
     U32   longLengthPos;
 } seqStore_t;
diff --git a/lib/compress/fse_compress.c b/lib/compress/fse_compress.c
index 70daae3bc..4075db217 100644
--- a/lib/compress/fse_compress.c
+++ b/lib/compress/fse_compress.c
@@ -83,7 +83,9 @@
  * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
  * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
  */
-size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+size_t FSE_buildCTable_wksp(FSE_CTable* ct,
+                      const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                            void* workSpace, size_t wkspSize)
 {
     U32 const tableSize = 1 << tableLog;
     U32 const tableMask = tableSize - 1;
@@ -101,10 +103,14 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi
     if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize) return ERROR(tableLog_tooLarge);
     tableU16[-2] = (U16) tableLog;
     tableU16[-1] = (U16) maxSymbolValue;
-    assert(tableLog < 16);   /* required for the threshold strategy to work */
+    assert(tableLog < 16);   /* required for threshold strategy to work */
 
     /* For explanations on how to distribute symbol values over the table :
-    *  http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+     * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+     #ifdef __clang_analyzer__
+     memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize);   /* useless initialization, just to keep scan-build happy */
+     #endif
 
     /* symbol start positions */
     {   U32 u;
@@ -124,13 +130,15 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi
         U32 symbol;
         for (symbol=0; symbol<=maxSymbolValue; symbol++) {
             int nbOccurences;
-            for (nbOccurences=0; nbOccurences<normalizedCounter[symbol]; nbOccurences++) {
+            int const freq = normalizedCounter[symbol];
+            for (nbOccurences=0; nbOccurences<freq; nbOccurences++) {
                 tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
                 position = (position + step) & tableMask;
-                while (position > highThreshold) position = (position + step) & tableMask;   /* Low proba area */
+                while (position > highThreshold)
+                    position = (position + step) & tableMask;   /* Low proba area */
         }   }
 
-        if (position!=0) return ERROR(GENERIC);   /* Must have gone through all positions */
+        assert(position==0);  /* Must have initialized all positions */
     }
 
     /* Build table */
@@ -201,9 +209,10 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
     return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND;  /* maxSymbolValue==0 ? use default */
 }
 
-static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
-                                       const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
-                                       unsigned writeIsSafe)
+static size_t
+FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+                   const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                         unsigned writeIsSafe)
 {
     BYTE* const ostart = (BYTE*) header;
     BYTE* out = ostart;
@@ -212,13 +221,12 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
     const int tableSize = 1 << tableLog;
     int remaining;
     int threshold;
-    U32 bitStream;
-    int bitCount;
-    unsigned charnum = 0;
-    int previous0 = 0;
+    U32 bitStream = 0;
+    int bitCount = 0;
+    unsigned symbol = 0;
+    unsigned const alphabetSize = maxSymbolValue + 1;
+    int previousIs0 = 0;
 
-    bitStream = 0;
-    bitCount  = 0;
     /* Table Size */
     bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount;
     bitCount  += 4;
@@ -228,48 +236,53 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
     threshold = tableSize;
     nbBits = tableLog+1;
 
-    while (remaining>1) {  /* stops at 1 */
-        if (previous0) {
-            unsigned start = charnum;
-            while (!normalizedCounter[charnum]) charnum++;
-            while (charnum >= start+24) {
+    while ((symbol < alphabetSize) && (remaining>1)) {  /* stops at 1 */
+        if (previousIs0) {
+            unsigned start = symbol;
+            while ((symbol < alphabetSize) && !normalizedCounter[symbol]) symbol++;
+            if (symbol == alphabetSize) break;   /* incorrect distribution */
+            while (symbol >= start+24) {
                 start+=24;
                 bitStream += 0xFFFFU << bitCount;
-                if ((!writeIsSafe) && (out > oend-2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                if ((!writeIsSafe) && (out > oend-2))
+                    return ERROR(dstSize_tooSmall);   /* Buffer overflow */
                 out[0] = (BYTE) bitStream;
                 out[1] = (BYTE)(bitStream>>8);
                 out+=2;
                 bitStream>>=16;
             }
-            while (charnum >= start+3) {
+            while (symbol >= start+3) {
                 start+=3;
                 bitStream += 3 << bitCount;
                 bitCount += 2;
             }
-            bitStream += (charnum-start) << bitCount;
+            bitStream += (symbol-start) << bitCount;
             bitCount += 2;
             if (bitCount>16) {
-                if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                if ((!writeIsSafe) && (out > oend - 2))
+                    return ERROR(dstSize_tooSmall);   /* Buffer overflow */
                 out[0] = (BYTE)bitStream;
                 out[1] = (BYTE)(bitStream>>8);
                 out += 2;
                 bitStream >>= 16;
                 bitCount -= 16;
         }   }
-        {   int count = normalizedCounter[charnum++];
-            int const max = (2*threshold-1)-remaining;
+        {   int count = normalizedCounter[symbol++];
+            int const max = (2*threshold-1) - remaining;
             remaining -= count < 0 ? -count : count;
             count++;   /* +1 for extra accuracy */
-            if (count>=threshold) count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+            if (count>=threshold)
+                count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
             bitStream += count << bitCount;
             bitCount  += nbBits;
             bitCount  -= (count<max);
-            previous0  = (count==1);
+            previousIs0  = (count==1);
             if (remaining<1) return ERROR(GENERIC);
             while (remaining<threshold) { nbBits--; threshold>>=1; }
         }
         if (bitCount>16) {
-            if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+            if ((!writeIsSafe) && (out > oend - 2))
+                return ERROR(dstSize_tooSmall);   /* Buffer overflow */
             out[0] = (BYTE)bitStream;
             out[1] = (BYTE)(bitStream>>8);
             out += 2;
@@ -277,19 +290,23 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
             bitCount -= 16;
     }   }
 
+    if (remaining != 1)
+        return ERROR(GENERIC);  /* incorrect normalized distribution */
+    assert(symbol <= alphabetSize);
+
     /* flush remaining bitStream */
-    if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+    if ((!writeIsSafe) && (out > oend - 2))
+        return ERROR(dstSize_tooSmall);   /* Buffer overflow */
     out[0] = (BYTE)bitStream;
     out[1] = (BYTE)(bitStream>>8);
     out+= (bitCount+7) /8;
 
-    if (charnum > maxSymbolValue + 1) return ERROR(GENERIC);
-
     return (out-ostart);
 }
 
 
-size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+                  const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
 {
     if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported */
     if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported */
@@ -297,7 +314,7 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalized
     if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
         return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
 
-    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
+    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1 /* write in buffer is safe */);
 }
 
 
diff --git a/lib/compress/hist.h b/lib/compress/hist.h
index 788470da7..8b1991a90 100644
--- a/lib/compress/hist.h
+++ b/lib/compress/hist.h
@@ -50,7 +50,7 @@
 size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
                   const void* src, size_t srcSize);
 
-unsigned HIST_isError(size_t code);  /*< tells if a return value is an error code */
+unsigned HIST_isError(size_t code);  /**< tells if a return value is an error code */
 
 
 /* --- advanced histogram functions --- */
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index e09eef3d1..513156604 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -805,7 +805,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
         size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
         U32    const divider = (cParams.searchLength==3) ? 3 : 4;
         size_t const maxNbSeq = blockSize / divider;
-        size_t const tokenSpace = blockSize + 11*maxNbSeq;
+        size_t const tokenSpace = WILDCOPY_OVERLENGTH + blockSize + 11*maxNbSeq;
         size_t const entropySpace = HUF_WORKSPACE_SIZE;
         size_t const blockStateSpace = 2 * sizeof(ZSTD_compressedBlockState_t);
         size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 1);
@@ -949,33 +949,51 @@ typedef enum { ZSTDb_not_buffered, ZSTDb_buffered } ZSTD_buffered_policy_e;
 /* ZSTD_sufficientBuff() :
  * check internal buffers exist for streaming if buffPol == ZSTDb_buffered .
  * Note : they are assumed to be correctly sized if ZSTD_equivalentCParams()==1 */
-static U32 ZSTD_sufficientBuff(size_t bufferSize1, size_t blockSize1,
+static U32 ZSTD_sufficientBuff(size_t bufferSize1, size_t maxNbSeq1,
+                            size_t maxNbLit1,
                             ZSTD_buffered_policy_e buffPol2,
                             ZSTD_compressionParameters cParams2,
                             U64 pledgedSrcSize)
 {
     size_t const windowSize2 = MAX(1, (size_t)MIN(((U64)1 << cParams2.windowLog), pledgedSrcSize));
     size_t const blockSize2 = MIN(ZSTD_BLOCKSIZE_MAX, windowSize2);
+    size_t const maxNbSeq2 = blockSize2 / ((cParams2.searchLength == 3) ? 3 : 4);
+    size_t const maxNbLit2 = blockSize2;
     size_t const neededBufferSize2 = (buffPol2==ZSTDb_buffered) ? windowSize2 + blockSize2 : 0;
-    DEBUGLOG(4, "ZSTD_sufficientBuff: is windowSize2=%u <= wlog1=%u",
-                (U32)windowSize2, cParams2.windowLog);
-    DEBUGLOG(4, "ZSTD_sufficientBuff: is blockSize2=%u <= blockSize1=%u",
-                (U32)blockSize2, (U32)blockSize1);
-    return (blockSize2 <= blockSize1) /* seqStore space depends on blockSize */
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is neededBufferSize2=%u <= bufferSize1=%u",
+                (U32)neededBufferSize2, (U32)bufferSize1);
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is maxNbSeq2=%u <= maxNbSeq1=%u",
+                (U32)maxNbSeq2, (U32)maxNbSeq1);
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is maxNbLit2=%u <= maxNbLit1=%u",
+                (U32)maxNbLit2, (U32)maxNbLit1);
+    return (maxNbLit2 <= maxNbLit1)
+         & (maxNbSeq2 <= maxNbSeq1)
          & (neededBufferSize2 <= bufferSize1);
 }
 
 /** Equivalence for resetCCtx purposes */
 static U32 ZSTD_equivalentParams(ZSTD_CCtx_params params1,
                                  ZSTD_CCtx_params params2,
-                                 size_t buffSize1, size_t blockSize1,
+                                 size_t buffSize1,
+                                 size_t maxNbSeq1, size_t maxNbLit1,
                                  ZSTD_buffered_policy_e buffPol2,
                                  U64 pledgedSrcSize)
 {
     DEBUGLOG(4, "ZSTD_equivalentParams: pledgedSrcSize=%u", (U32)pledgedSrcSize);
-    return ZSTD_equivalentCParams(params1.cParams, params2.cParams) &&
-           ZSTD_equivalentLdmParams(params1.ldmParams, params2.ldmParams) &&
-           ZSTD_sufficientBuff(buffSize1, blockSize1, buffPol2, params2.cParams, pledgedSrcSize);
+    if (!ZSTD_equivalentCParams(params1.cParams, params2.cParams)) {
+      DEBUGLOG(4, "ZSTD_equivalentCParams() == 0");
+      return 0;
+    }
+    if (!ZSTD_equivalentLdmParams(params1.ldmParams, params2.ldmParams)) {
+      DEBUGLOG(4, "ZSTD_equivalentLdmParams() == 0");
+      return 0;
+    }
+    if (!ZSTD_sufficientBuff(buffSize1, maxNbSeq1, maxNbLit1, buffPol2,
+                             params2.cParams, pledgedSrcSize)) {
+      DEBUGLOG(4, "ZSTD_sufficientBuff() == 0");
+      return 0;
+    }
+    return 1;
 }
 
 static void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs)
@@ -1103,8 +1121,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
 
     if (crp == ZSTDcrp_continue) {
         if (ZSTD_equivalentParams(zc->appliedParams, params,
-                                zc->inBuffSize, zc->blockSize,
-                                zbuff, pledgedSrcSize)) {
+                                  zc->inBuffSize,
+                                  zc->seqStore.maxNbSeq, zc->seqStore.maxNbLit,
+                                  zbuff, pledgedSrcSize)) {
             DEBUGLOG(4, "ZSTD_equivalentParams()==1 -> continue mode (wLog1=%u, blockSize1=%zu)",
                         zc->appliedParams.cParams.windowLog, zc->blockSize);
             zc->workSpaceOversizedDuration += (zc->workSpaceOversizedDuration > 0);   /* if it was too large, it still is */
@@ -1125,7 +1144,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
         size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
         U32    const divider = (params.cParams.searchLength==3) ? 3 : 4;
         size_t const maxNbSeq = blockSize / divider;
-        size_t const tokenSpace = blockSize + 11*maxNbSeq;
+        size_t const tokenSpace = WILDCOPY_OVERLENGTH + blockSize + 11*maxNbSeq;
         size_t const buffOutSize = (zbuff==ZSTDb_buffered) ? ZSTD_compressBound(blockSize)+1 : 0;
         size_t const buffInSize = (zbuff==ZSTDb_buffered) ? windowSize + blockSize : 0;
         size_t const matchStateSize = ZSTD_sizeof_matchState(&params.cParams, /* forCCtx */ 1);
@@ -1165,7 +1184,6 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
                 if (zc->workSpace == NULL) return ERROR(memory_allocation);
                 zc->workSpaceSize = neededSpace;
                 zc->workSpaceOversizedDuration = 0;
-                ptr = zc->workSpace;
 
                 /* Statically sized space.
                  * entropyWorkspace never moves,
@@ -1216,13 +1234,18 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
         ptr = ZSTD_reset_matchState(&zc->blockState.matchState, ptr, &params.cParams, crp, /* forCCtx */ 1);
 
         /* sequences storage */
+        zc->seqStore.maxNbSeq = maxNbSeq;
         zc->seqStore.sequencesStart = (seqDef*)ptr;
         ptr = zc->seqStore.sequencesStart + maxNbSeq;
         zc->seqStore.llCode = (BYTE*) ptr;
         zc->seqStore.mlCode = zc->seqStore.llCode + maxNbSeq;
         zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq;
         zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq;
-        ptr = zc->seqStore.litStart + blockSize;
+        /* ZSTD_wildcopy() is used to copy into the literals buffer,
+         * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
+         */
+        zc->seqStore.maxNbLit = blockSize;
+        ptr = zc->seqStore.litStart + blockSize + WILDCOPY_OVERLENGTH;
 
         /* ldm bucketOffsets table */
         if (params.ldmParams.enableLdm) {
@@ -1341,8 +1364,7 @@ static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
         }
 
         /* copy dictionary offsets */
-        {
-            ZSTD_matchState_t const* srcMatchState = &cdict->matchState;
+        {   ZSTD_matchState_t const* srcMatchState = &cdict->matchState;
             ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState;
             dstMatchState->window       = srcMatchState->window;
             dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
@@ -1666,6 +1688,7 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
     BYTE* const mlCodeTable = seqStorePtr->mlCode;
     U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
     U32 u;
+    assert(nbSeq <= seqStorePtr->maxNbSeq);
     for (u=0; u<nbSeq; u++) {
         U32 const llv = sequences[u].litLength;
         U32 const mlv = sequences[u].matchLength;
@@ -2254,13 +2277,6 @@ MEM_STATIC size_t ZSTD_compressSequences(seqStore_t* seqStorePtr,
         if (cSize >= maxCSize) return 0;  /* block not compressed */
     }
 
-    /* We check that dictionaries have offset codes available for the first
-     * block. After the first block, the offcode table might not have large
-     * enough codes to represent the offsets in the data.
-     */
-    if (nextEntropy->fse.offcode_repeatMode == FSE_repeat_valid)
-        nextEntropy->fse.offcode_repeatMode = FSE_repeat_check;
-
     return cSize;
 }
 
@@ -2399,12 +2415,20 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
                                 &zc->appliedParams,
                                 dst, dstCapacity,
                                 srcSize, zc->entropyWorkspace, zc->bmi2);
-        if (ZSTD_isError(cSize) || cSize == 0) return cSize;
-        /* confirm repcodes and entropy tables */
-        {   ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
+        if (!ZSTD_isError(cSize) && cSize != 0) {
+            /* confirm repcodes and entropy tables */
+            ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
             zc->blockState.prevCBlock = zc->blockState.nextCBlock;
             zc->blockState.nextCBlock = tmp;
         }
+
+        /* We check that dictionaries have offset codes available for the first
+         * block. After the first block, the offcode table might not have large
+         * enough codes to represent the offsets in the data.
+         */
+        if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+            zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+
         return cSize;
     }
 }
diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h
index d31542c69..1c95b7de9 100644
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -314,8 +314,10 @@ MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const v
                pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offsetCode);
     }
 #endif
+    assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
     /* copy Literals */
-    assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + 128 KB);
+    assert(seqStorePtr->maxNbLit <= 128 KB);
+    assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit);
     ZSTD_wildcopy(seqStorePtr->lit, literals, litLength);
     seqStorePtr->lit += litLength;
 
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index 476cdc148..c4b9bb13b 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -970,7 +970,7 @@ _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
             U32 seqPos = cur;
 
             DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)",
-                        last_pos, cur);
+                        last_pos, cur); (void)last_pos;
             assert(storeEnd < ZSTD_OPT_NUM);
             DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
                         storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off);
diff --git a/lib/compress/zstdmt_compress.c b/lib/compress/zstdmt_compress.c
index cc1af58e6..244690cdc 100644
--- a/lib/compress/zstdmt_compress.c
+++ b/lib/compress/zstdmt_compress.c
@@ -320,7 +320,8 @@ static void ZSTDMT_setNbSeq(ZSTDMT_seqPool* const seqPool, size_t const nbSeq)
 
 static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem)
 {
-    ZSTDMT_seqPool* seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+    ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+    if (seqPool == NULL) return NULL;
     ZSTDMT_setNbSeq(seqPool, 0);
     return seqPool;
 }
diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c
index a696261bd..83ecaff01 100644
--- a/lib/decompress/huf_decompress.c
+++ b/lib/decompress/huf_decompress.c
@@ -533,9 +533,9 @@ static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
     }
 }
 
-size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src,
-                             size_t srcSize, void* workSpace,
-                             size_t wkspSize)
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
+                       const void* src, size_t srcSize,
+                             void* workSpace, size_t wkspSize)
 {
     U32 tableLog, maxW, sizeOfSort, nbSymbols;
     DTableDesc dtd = HUF_getDTableDesc(DTable);
diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c
index 8f4589d13..1382c9c7f 100644
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@@ -40,7 +40,6 @@
 #  define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_DEFAULTMAX) + 1)
 #endif
 
-
 /*!
  *  NO_FORWARD_PROGRESS_MAX :
  *  maximum allowed nb of calls to ZSTD_decompressStream() and ZSTD_decompress_generic()
@@ -52,11 +51,13 @@
 #  define ZSTD_NO_FORWARD_PROGRESS_MAX 16
 #endif
 
+
 /*-*******************************************************
 *  Dependencies
 *********************************************************/
 #include <string.h>      /* memcpy, memmove, memset */
-#include "cpu.h"
+#include "compiler.h"    /* prefetch */
+#include "cpu.h"         /* bmi2 */
 #include "mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
 #include "fse.h"
@@ -68,6 +69,9 @@
 #  include "zstd_legacy.h"
 #endif
 
+static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict);
+static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict);
+
 
 /*-*************************************
 *  Errors
@@ -110,11 +114,10 @@ typedef struct {
 #define SEQSYMBOL_TABLE_SIZE(log)   (1 + (1 << (log)))
 
 typedef struct {
-    ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];
-    ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];
-    ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];
+    ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];    /* Note : Space reserved for FSE Tables */
+    ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];   /* is also used as temporary workspace while building hufTable during DDict creation */
+    ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];    /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
     HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)];  /* can accommodate HUF_decompress4X */
-    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
     U32 rep[ZSTD_REP_NUM];
 } ZSTD_entropyDTables_t;
 
@@ -125,6 +128,7 @@ struct ZSTD_DCtx_s
     const ZSTD_seqSymbol* OFTptr;
     const HUF_DTable* HUFptr;
     ZSTD_entropyDTables_t entropy;
+    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];   /* space needed when building huffman tables */
     const void* previousDstEnd;   /* detect continuity */
     const void* prefixStart;      /* start of current segment */
     const void* virtualStart;     /* virtual start of previous segment if it was just before current one */
@@ -138,7 +142,6 @@ struct ZSTD_DCtx_s
     U32 fseEntropy;
     XXH64_state_t xxhState;
     size_t headerSize;
-    U32 dictID;
     ZSTD_format_e format;
     const BYTE* litPtr;
     ZSTD_customMem customMem;
@@ -147,9 +150,13 @@ struct ZSTD_DCtx_s
     size_t staticSize;
     int bmi2;                     /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
 
-    /* streaming */
+    /* dictionary */
     ZSTD_DDict* ddictLocal;
-    const ZSTD_DDict* ddict;
+    const ZSTD_DDict* ddict;     /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
+    U32 dictID;
+    int ddictIsCold;             /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
+
+    /* streaming */
     ZSTD_dStreamStage streamStage;
     char*  inBuff;
     size_t inBuffSize;
@@ -185,7 +192,7 @@ size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); }
 static size_t ZSTD_startingInputLength(ZSTD_format_e format)
 {
     size_t const startingInputLength = (format==ZSTD_f_zstd1_magicless) ?
-                    ZSTD_frameHeaderSize_prefix - ZSTD_frameIdSize :
+                    ZSTD_frameHeaderSize_prefix - ZSTD_FRAMEIDSIZE :
                     ZSTD_frameHeaderSize_prefix;
     ZSTD_STATIC_ASSERT(ZSTD_FRAMEHEADERSIZE_PREFIX >= ZSTD_FRAMEIDSIZE);
     /* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless */
@@ -200,6 +207,8 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
     dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
     dctx->ddict       = NULL;
     dctx->ddictLocal  = NULL;
+    dctx->dictEnd     = NULL;
+    dctx->ddictIsCold = 0;
     dctx->inBuff      = NULL;
     dctx->inBuffSize  = 0;
     dctx->outBuffSize = 0;
@@ -278,7 +287,7 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
  *  Note 3 : Skippable Frame Identifiers are considered valid. */
 unsigned ZSTD_isFrame(const void* buffer, size_t size)
 {
-    if (size < ZSTD_frameIdSize) return 0;
+    if (size < ZSTD_FRAMEIDSIZE) return 0;
     {   U32 const magic = MEM_readLE32(buffer);
         if (magic == ZSTD_MAGICNUMBER) return 1;
         if ((magic & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) return 1;
@@ -330,7 +339,9 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s
     const BYTE* ip = (const BYTE*)src;
     size_t const minInputSize = ZSTD_startingInputLength(format);
 
+    memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */
     if (srcSize < minInputSize) return minInputSize;
+    if (src==NULL) return ERROR(GENERIC);   /* invalid parameter */
 
     if ( (format != ZSTD_f_zstd1_magicless)
       && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
@@ -339,7 +350,7 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s
             if (srcSize < ZSTD_skippableHeaderSize)
                 return ZSTD_skippableHeaderSize; /* magic number + frame length */
             memset(zfhPtr, 0, sizeof(*zfhPtr));
-            zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_frameIdSize);
+            zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE);
             zfhPtr->frameType = ZSTD_skippableFrame;
             return 0;
         }
@@ -451,7 +462,7 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
             size_t skippableSize;
             if (srcSize < ZSTD_skippableHeaderSize)
                 return ERROR(srcSize_wrong);
-            skippableSize = MEM_readLE32((const BYTE *)src + ZSTD_frameIdSize)
+            skippableSize = MEM_readLE32((const BYTE *)src + ZSTD_FRAMEIDSIZE)
                           + ZSTD_skippableHeaderSize;
             if (srcSize < skippableSize) {
                 return ZSTD_CONTENTSIZE_ERROR;
@@ -540,6 +551,7 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
 static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity,
                           const void* src, size_t srcSize)
 {
+    if (dst==NULL) return ERROR(dstSize_tooSmall);
     if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall);
     memcpy(dst, src, srcSize);
     return srcSize;
@@ -572,6 +584,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
         case set_repeat:
             if (dctx->litEntropy==0) return ERROR(dictionary_corrupted);
             /* fall-through */
+
         case set_compressed:
             if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
             {   size_t lhSize, litSize, litCSize;
@@ -603,15 +616,20 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                 if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected);
                 if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
 
+                /* prefetch huffman table if cold */
+                if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
+                    PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
+                }
+
                 if (HUF_isError((litEncType==set_repeat) ?
                                     ( singleStream ?
                                         HUF_decompress1X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) :
                                         HUF_decompress4X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) ) :
                                     ( singleStream ?
                                         HUF_decompress1X1_DCtx_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                                                                         dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2) :
+                                                                         dctx->workspace, sizeof(dctx->workspace), dctx->bmi2) :
                                         HUF_decompress4X_hufOnly_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                                                                           dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2))))
+                                                                           dctx->workspace, sizeof(dctx->workspace), dctx->bmi2))))
                     return ERROR(corruption_detected);
 
                 dctx->litPtr = dctx->litBuffer;
@@ -883,7 +901,8 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
                                  symbolEncodingType_e type, U32 max, U32 maxLog,
                                  const void* src, size_t srcSize,
                                  const U32* baseValue, const U32* nbAdditionalBits,
-                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable)
+                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
+                                 int ddictIsCold, int nbSeq)
 {
     switch(type)
     {
@@ -902,6 +921,12 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
         return 0;
     case set_repeat:
         if (!flagRepeatTable) return ERROR(corruption_detected);
+        /* prefetch FSE table if used */
+        if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
+            const void* const pStart = *DTablePtr;
+            size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
+            PREFETCH_AREA(pStart, pSize);
+        }
         return 0;
     case set_compressed :
         {   U32 tableLog;
@@ -954,25 +979,25 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
     const BYTE* const istart = (const BYTE* const)src;
     const BYTE* const iend = istart + srcSize;
     const BYTE* ip = istart;
+    int nbSeq;
     DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
 
     /* check */
     if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);
 
     /* SeqHead */
-    {   int nbSeq = *ip++;
-        if (!nbSeq) { *nbSeqPtr=0; return 1; }
-        if (nbSeq > 0x7F) {
-            if (nbSeq == 0xFF) {
-                if (ip+2 > iend) return ERROR(srcSize_wrong);
-                nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
-            } else {
-                if (ip >= iend) return ERROR(srcSize_wrong);
-                nbSeq = ((nbSeq-0x80)<<8) + *ip++;
-            }
+    nbSeq = *ip++;
+    if (!nbSeq) { *nbSeqPtr=0; return 1; }
+    if (nbSeq > 0x7F) {
+        if (nbSeq == 0xFF) {
+            if (ip+2 > iend) return ERROR(srcSize_wrong);
+            nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+        } else {
+            if (ip >= iend) return ERROR(srcSize_wrong);
+            nbSeq = ((nbSeq-0x80)<<8) + *ip++;
         }
-        *nbSeqPtr = nbSeq;
     }
+    *nbSeqPtr = nbSeq;
 
     /* FSE table descriptors */
     if (ip+4 > iend) return ERROR(srcSize_wrong); /* minimum possible size */
@@ -986,7 +1011,8 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                       LLtype, MaxLL, LLFSELog,
                                                       ip, iend-ip,
                                                       LL_base, LL_bits,
-                                                      LL_defaultDTable, dctx->fseEntropy);
+                                                      LL_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq);
             if (ZSTD_isError(llhSize)) return ERROR(corruption_detected);
             ip += llhSize;
         }
@@ -995,7 +1021,8 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                       OFtype, MaxOff, OffFSELog,
                                                       ip, iend-ip,
                                                       OF_base, OF_bits,
-                                                      OF_defaultDTable, dctx->fseEntropy);
+                                                      OF_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq);
             if (ZSTD_isError(ofhSize)) return ERROR(corruption_detected);
             ip += ofhSize;
         }
@@ -1004,12 +1031,23 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                       MLtype, MaxML, MLFSELog,
                                                       ip, iend-ip,
                                                       ML_base, ML_bits,
-                                                      ML_defaultDTable, dctx->fseEntropy);
+                                                      ML_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq);
             if (ZSTD_isError(mlhSize)) return ERROR(corruption_detected);
             ip += mlhSize;
         }
     }
 
+    /* prefetch dictionary content */
+    if (dctx->ddictIsCold) {
+        size_t const dictSize = (const char*)dctx->prefixStart - (const char*)dctx->virtualStart;
+        size_t const psmin = MIN(dictSize, (size_t)(64*nbSeq) /* heuristic */ );
+        size_t const pSize = MIN(psmin, 128 KB /* protection */ );
+        const void* const pStart = (const char*)dctx->dictEnd - pSize;
+        PREFETCH_AREA(pStart, pSize);
+        dctx->ddictIsCold = 0;
+    }
+
     return ip-istart;
 }
 
@@ -1676,7 +1714,8 @@ static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
     /* isLongOffset must be true if there are long offsets.
      * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
      * We don't expect that to be the case in 64-bit mode.
-     * In block mode, window size is not known, so we have to be conservative. (note: but it could be evaluated from current-lowLimit)
+     * In block mode, window size is not known, so we have to be conservative.
+     * (note: but it could be evaluated from current-lowLimit)
      */
     ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)));
     DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
@@ -1763,7 +1802,7 @@ size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
 #endif
     if ( (srcSize >= ZSTD_skippableHeaderSize)
       && (MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START ) {
-        return ZSTD_skippableHeaderSize + MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize);
+        return ZSTD_skippableHeaderSize + MEM_readLE32((const BYTE*)src + ZSTD_FRAMEIDSIZE);
     } else {
         const BYTE* ip = (const BYTE*)src;
         const BYTE* const ipstart = ip;
@@ -1797,7 +1836,6 @@ size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
         if (zfh.checksumFlag) {   /* Final frame content checksum */
             if (remainingSize < 4) return ERROR(srcSize_wrong);
             ip += 4;
-            remainingSize -= 4;
         }
 
         return ip - ipstart;
@@ -1885,9 +1923,6 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
     return op-ostart;
 }
 
-static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict);
-static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict);
-
 static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
                                         void* dst, size_t dstCapacity,
                                   const void* src, size_t srcSize,
@@ -1896,6 +1931,8 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
 {
     void* const dststart = dst;
     int moreThan1Frame = 0;
+
+    DEBUGLOG(5, "ZSTD_decompressMultiFrame");
     assert(dict==NULL || ddict==NULL);  /* either dict or ddict set, not both */
 
     if (ddict) {
@@ -1932,7 +1969,7 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
                 size_t skippableSize;
                 if (srcSize < ZSTD_skippableHeaderSize)
                     return ERROR(srcSize_wrong);
-                skippableSize = MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize)
+                skippableSize = MEM_readLE32((const BYTE*)src + ZSTD_FRAMEIDSIZE)
                               + ZSTD_skippableHeaderSize;
                 if (srcSize < skippableSize) return ERROR(srcSize_wrong);
 
@@ -2057,7 +2094,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
     case ZSTDds_getFrameHeaderSize :
         assert(src != NULL);
         if (dctx->format == ZSTD_f_zstd1) {  /* allows header */
-            assert(srcSize >= ZSTD_frameIdSize);  /* to read skippable magic number */
+            assert(srcSize >= ZSTD_FRAMEIDSIZE);  /* to read skippable magic number */
             if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {        /* skippable frame */
                 memcpy(dctx->headerBuffer, src, srcSize);
                 dctx->expected = ZSTD_skippableHeaderSize - srcSize;  /* remaining to load to get full skippable frame header */
@@ -2167,7 +2204,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
         assert(src != NULL);
         assert(srcSize <= ZSTD_skippableHeaderSize);
         memcpy(dctx->headerBuffer + (ZSTD_skippableHeaderSize - srcSize), src, srcSize);   /* complete skippable header */
-        dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_frameIdSize);   /* note : dctx->expected can grow seriously large, beyond local buffer size */
+        dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE);   /* note : dctx->expected can grow seriously large, beyond local buffer size */
         dctx->stage = ZSTDds_skipFrame;
         return 0;
 
@@ -2191,21 +2228,27 @@ static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dict
     return 0;
 }
 
-/* ZSTD_loadEntropy() :
- * dict : must point at beginning of a valid zstd dictionary
+/*! ZSTD_loadEntropy() :
+ *  dict : must point at beginning of a valid zstd dictionary.
  * @return : size of entropy tables read */
-static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const dict, size_t const dictSize)
+static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy,
+                         const void* const dict, size_t const dictSize)
 {
     const BYTE* dictPtr = (const BYTE*)dict;
     const BYTE* const dictEnd = dictPtr + dictSize;
 
     if (dictSize <= 8) return ERROR(dictionary_corrupted);
+    assert(MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY);   /* dict must be valid */
     dictPtr += 8;   /* skip header = magic + dictID */
 
-
-    {   size_t const hSize = HUF_readDTableX2_wksp(
-            entropy->hufTable, dictPtr, dictEnd - dictPtr,
-            entropy->workspace, sizeof(entropy->workspace));
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, OFTable) == offsetof(ZSTD_entropyDTables_t, LLTable) + sizeof(entropy->LLTable));
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, MLTable) == offsetof(ZSTD_entropyDTables_t, OFTable) + sizeof(entropy->OFTable));
+    ZSTD_STATIC_ASSERT(sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable) >= HUF_DECOMPRESS_WORKSPACE_SIZE);
+    {   void* const workspace = &entropy->LLTable;   /* use fse tables as temporary workspace; implies fse tables are grouped together */
+        size_t const workspaceSize = sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable);
+        size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable,
+                                                dictPtr, dictEnd - dictPtr,
+                                                workspace, workspaceSize);
         if (HUF_isError(hSize)) return ERROR(dictionary_corrupted);
         dictPtr += hSize;
     }
@@ -2216,7 +2259,7 @@ static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const
         if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
         if (offcodeMaxValue > MaxOff) return ERROR(dictionary_corrupted);
         if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
-        ZSTD_buildFSETable(entropy->OFTable,
+        ZSTD_buildFSETable( entropy->OFTable,
                             offcodeNCount, offcodeMaxValue,
                             OF_base, OF_bits,
                             offcodeLog);
@@ -2229,7 +2272,7 @@ static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const
         if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
         if (matchlengthMaxValue > MaxML) return ERROR(dictionary_corrupted);
         if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
-        ZSTD_buildFSETable(entropy->MLTable,
+        ZSTD_buildFSETable( entropy->MLTable,
                             matchlengthNCount, matchlengthMaxValue,
                             ML_base, ML_bits,
                             matchlengthLog);
@@ -2242,7 +2285,7 @@ static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const
         if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
         if (litlengthMaxValue > MaxLL) return ERROR(dictionary_corrupted);
         if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
-        ZSTD_buildFSETable(entropy->LLTable,
+        ZSTD_buildFSETable( entropy->LLTable,
                             litlengthNCount, litlengthMaxValue,
                             LL_base, LL_bits,
                             litlengthLog);
@@ -2268,7 +2311,7 @@ static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict
         if (magic != ZSTD_MAGIC_DICTIONARY) {
             return ZSTD_refDictContent(dctx, dict, dictSize);   /* pure content mode */
     }   }
-    dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_frameIdSize);
+    dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
 
     /* load entropy tables */
     {   size_t const eSize = ZSTD_loadEntropy(&dctx->entropy, dict, dictSize);
@@ -2282,7 +2325,6 @@ static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict
     return ZSTD_refDictContent(dctx, dict, dictSize);
 }
 
-/* Note : this function cannot fail */
 size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
 {
     assert(dctx != NULL);
@@ -2328,42 +2370,53 @@ struct ZSTD_DDict_s {
 
 static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict)
 {
+    assert(ddict != NULL);
     return ddict->dictContent;
 }
 
 static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict)
 {
+    assert(ddict != NULL);
     return ddict->dictSize;
 }
 
-size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dstDCtx, const ZSTD_DDict* ddict)
+size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
 {
-    CHECK_F( ZSTD_decompressBegin(dstDCtx) );
-    if (ddict) {   /* support begin on NULL */
-        dstDCtx->dictID = ddict->dictID;
-        dstDCtx->prefixStart = ddict->dictContent;
-        dstDCtx->virtualStart = ddict->dictContent;
-        dstDCtx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;
-        dstDCtx->previousDstEnd = dstDCtx->dictEnd;
+    DEBUGLOG(4, "ZSTD_decompressBegin_usingDDict");
+    assert(dctx != NULL);
+    if (ddict) {
+        dctx->ddictIsCold = (dctx->dictEnd != (const char*)ddict->dictContent + ddict->dictSize);
+        DEBUGLOG(4, "DDict is %s",
+                    dctx->ddictIsCold ? "~cold~" : "hot!");
+    }
+    CHECK_F( ZSTD_decompressBegin(dctx) );
+    if (ddict) {   /* NULL ddict is equivalent to no dictionary */
+        dctx->dictID = ddict->dictID;
+        dctx->prefixStart = ddict->dictContent;
+        dctx->virtualStart = ddict->dictContent;
+        dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;
+        dctx->previousDstEnd = dctx->dictEnd;
         if (ddict->entropyPresent) {
-            dstDCtx->litEntropy = 1;
-            dstDCtx->fseEntropy = 1;
-            dstDCtx->LLTptr = ddict->entropy.LLTable;
-            dstDCtx->MLTptr = ddict->entropy.MLTable;
-            dstDCtx->OFTptr = ddict->entropy.OFTable;
-            dstDCtx->HUFptr = ddict->entropy.hufTable;
-            dstDCtx->entropy.rep[0] = ddict->entropy.rep[0];
-            dstDCtx->entropy.rep[1] = ddict->entropy.rep[1];
-            dstDCtx->entropy.rep[2] = ddict->entropy.rep[2];
+            dctx->litEntropy = 1;
+            dctx->fseEntropy = 1;
+            dctx->LLTptr = ddict->entropy.LLTable;
+            dctx->MLTptr = ddict->entropy.MLTable;
+            dctx->OFTptr = ddict->entropy.OFTable;
+            dctx->HUFptr = ddict->entropy.hufTable;
+            dctx->entropy.rep[0] = ddict->entropy.rep[0];
+            dctx->entropy.rep[1] = ddict->entropy.rep[1];
+            dctx->entropy.rep[2] = ddict->entropy.rep[2];
         } else {
-            dstDCtx->litEntropy = 0;
-            dstDCtx->fseEntropy = 0;
+            dctx->litEntropy = 0;
+            dctx->fseEntropy = 0;
         }
     }
     return 0;
 }
 
-static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict, ZSTD_dictContentType_e dictContentType)
+static size_t
+ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict,
+                         ZSTD_dictContentType_e dictContentType)
 {
     ddict->dictID = 0;
     ddict->entropyPresent = 0;
@@ -2381,10 +2434,12 @@ static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict, ZSTD_dictContentType_e
             return 0;   /* pure content mode */
         }
     }
-    ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_frameIdSize);
+    ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE);
 
     /* load entropy tables */
-    CHECK_E( ZSTD_loadEntropy(&ddict->entropy, ddict->dictContent, ddict->dictSize), dictionary_corrupted );
+    CHECK_E( ZSTD_loadEntropy(&ddict->entropy,
+                              ddict->dictContent, ddict->dictSize),
+             dictionary_corrupted );
     ddict->entropyPresent = 1;
     return 0;
 }
@@ -2398,6 +2453,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
     if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) {
         ddict->dictBuffer = NULL;
         ddict->dictContent = dict;
+        if (!dict) dictSize = 0;
     } else {
         void* const internalBuffer = ZSTD_malloc(dictSize, ddict->cMem);
         ddict->dictBuffer = internalBuffer;
@@ -2422,14 +2478,15 @@ ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
     if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
 
     {   ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem);
-        if (!ddict) return NULL;
+        if (ddict == NULL) return NULL;
         ddict->cMem = customMem;
-
-        if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, dictLoadMethod, dictContentType) )) {
-            ZSTD_freeDDict(ddict);
-            return NULL;
-        }
-
+        {   size_t const initResult = ZSTD_initDDict_internal(ddict,
+                                            dict, dictSize,
+                                            dictLoadMethod, dictContentType);
+            if (ZSTD_isError(initResult)) {
+                ZSTD_freeDDict(ddict);
+                return NULL;
+        }   }
         return ddict;
     }
 }
@@ -2456,23 +2513,25 @@ ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize
 
 
 const ZSTD_DDict* ZSTD_initStaticDDict(
-                                void* workspace, size_t workspaceSize,
+                                void* sBuffer, size_t sBufferSize,
                                 const void* dict, size_t dictSize,
                                 ZSTD_dictLoadMethod_e dictLoadMethod,
                                 ZSTD_dictContentType_e dictContentType)
 {
-    size_t const neededSpace =
-            sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
-    ZSTD_DDict* const ddict = (ZSTD_DDict*)workspace;
-    assert(workspace != NULL);
+    size_t const neededSpace = sizeof(ZSTD_DDict)
+                             + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
+    ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer;
+    assert(sBuffer != NULL);
     assert(dict != NULL);
-    if ((size_t)workspace & 7) return NULL;  /* 8-aligned */
-    if (workspaceSize < neededSpace) return NULL;
+    if ((size_t)sBuffer & 7) return NULL;   /* 8-aligned */
+    if (sBufferSize < neededSpace) return NULL;
     if (dictLoadMethod == ZSTD_dlm_byCopy) {
         memcpy(ddict+1, dict, dictSize);  /* local copy */
         dict = ddict+1;
     }
-    if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, ZSTD_dlm_byRef, dictContentType) ))
+    if (ZSTD_isError( ZSTD_initDDict_internal(ddict,
+                                              dict, dictSize,
+                                              ZSTD_dlm_byRef, dictContentType) ))
         return NULL;
     return ddict;
 }
@@ -2510,7 +2569,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
 {
     if (dictSize < 8) return 0;
     if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) return 0;
-    return MEM_readLE32((const char*)dict + ZSTD_frameIdSize);
+    return MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
 }
 
 /*! ZSTD_getDictID_fromDDict() :
@@ -2586,12 +2645,15 @@ size_t ZSTD_freeDStream(ZSTD_DStream* zds)
 }
 
 
-/* *** Initialization *** */
+/* ***  Initialization  *** */
 
 size_t ZSTD_DStreamInSize(void)  { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; }
 size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; }
 
-size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
+size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx,
+                                   const void* dict, size_t dictSize,
+                                         ZSTD_dictLoadMethod_e dictLoadMethod,
+                                         ZSTD_dictContentType_e dictContentType)
 {
     if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
     ZSTD_freeDDict(dctx->ddictLocal);
@@ -2645,13 +2707,6 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds)
     return ZSTD_initDStream_usingDict(zds, NULL, 0);
 }
 
-size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
-{
-    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
-    dctx->ddict = ddict;
-    return 0;
-}
-
 /* ZSTD_initDStream_usingDDict() :
  * ddict will just be referenced, and must outlive decompression session
  * this function cannot fail */
@@ -2690,6 +2745,13 @@ size_t ZSTD_setDStreamParameter(ZSTD_DStream* dctx,
     return 0;
 }
 
+size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{
+    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
+    dctx->ddict = ddict;
+    return 0;
+}
+
 size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize)
 {
     if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
@@ -2855,7 +2917,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
             CHECK_F(ZSTD_decompressBegin_usingDDict(zds, zds->ddict));
 
             if ((MEM_readLE32(zds->headerBuffer) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {  /* skippable frame */
-                zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_frameIdSize);
+                zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE);
                 zds->stage = ZSTDds_skipFrame;
             } else {
                 CHECK_F(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize));
diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c
index e32991652..6b4af69d2 100644
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@@ -29,6 +29,7 @@
 #include "mem.h" /* read */
 #include "pool.h"
 #include "threading.h"
+#include "cover.h"
 #include "zstd_internal.h" /* includes zstd.h */
 #ifndef ZDICT_STATIC_LINKING_ONLY
 #define ZDICT_STATIC_LINKING_ONLY
@@ -185,7 +186,7 @@ static void COVER_map_remove(COVER_map_t *map, U32 key) {
 }
 
 /**
- * Destroyes a map that is inited with COVER_map_init().
+ * Destroys a map that is inited with COVER_map_init().
  */
 static void COVER_map_destroy(COVER_map_t *map) {
   if (map->data) {
@@ -223,7 +224,7 @@ static COVER_ctx_t *g_ctx = NULL;
 /**
  * Returns the sum of the sample sizes.
  */
-static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
+size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
   size_t sum = 0;
   unsigned i;
   for (i = 0; i < nbSamples; ++i) {
@@ -380,14 +381,6 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group,
   ctx->suffix[dmerId] = freq;
 }
 
-/**
- * A segment is a range in the source as well as the score of the segment.
- */
-typedef struct {
-  U32 begin;
-  U32 end;
-  U32 score;
-} COVER_segment_t;
 
 /**
  * Selects the best segment in an epoch.
@@ -737,28 +730,65 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
   }
 }
 
-/**
- * COVER_best_t is used for two purposes:
- * 1. Synchronizing threads.
- * 2. Saving the best parameters and dictionary.
- *
- * All of the methods except COVER_best_init() are thread safe if zstd is
- * compiled with multithreaded support.
- */
-typedef struct COVER_best_s {
-  ZSTD_pthread_mutex_t mutex;
-  ZSTD_pthread_cond_t cond;
-  size_t liveJobs;
-  void *dict;
-  size_t dictSize;
-  ZDICT_cover_params_t parameters;
-  size_t compressedSize;
-} COVER_best_t;
+
+
+size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+                                    const size_t *samplesSizes, const BYTE *samples,
+                                    size_t *offsets,
+                                    size_t nbTrainSamples, size_t nbSamples,
+                                    BYTE *const dict, size_t dictBufferCapacity) {
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Pointers */
+  ZSTD_CCtx *cctx;
+  ZSTD_CDict *cdict;
+  void *dst;
+  /* Local variables */
+  size_t dstCapacity;
+  size_t i;
+  /* Allocate dst with enough space to compress the maximum sized sample */
+  {
+    size_t maxSampleSize = 0;
+    i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
+    for (; i < nbSamples; ++i) {
+      maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
+    }
+    dstCapacity = ZSTD_compressBound(maxSampleSize);
+    dst = malloc(dstCapacity);
+  }
+  /* Create the cctx and cdict */
+  cctx = ZSTD_createCCtx();
+  cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+                           parameters.zParams.compressionLevel);
+  if (!dst || !cctx || !cdict) {
+    goto _compressCleanup;
+  }
+  /* Compress each sample and sum their sizes (or error) */
+  totalCompressedSize = dictBufferCapacity;
+  i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
+  for (; i < nbSamples; ++i) {
+    const size_t size = ZSTD_compress_usingCDict(
+        cctx, dst, dstCapacity, samples + offsets[i],
+        samplesSizes[i], cdict);
+    if (ZSTD_isError(size)) {
+      totalCompressedSize = ERROR(GENERIC);
+      goto _compressCleanup;
+    }
+    totalCompressedSize += size;
+  }
+_compressCleanup:
+  ZSTD_freeCCtx(cctx);
+  ZSTD_freeCDict(cdict);
+  if (dst) {
+    free(dst);
+  }
+  return totalCompressedSize;
+}
+
 
 /**
  * Initialize the `COVER_best_t`.
  */
-static void COVER_best_init(COVER_best_t *best) {
+void COVER_best_init(COVER_best_t *best) {
   if (best==NULL) return; /* compatible with init on NULL */
   (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
   (void)ZSTD_pthread_cond_init(&best->cond, NULL);
@@ -772,7 +802,7 @@ static void COVER_best_init(COVER_best_t *best) {
 /**
  * Wait until liveJobs == 0.
  */
-static void COVER_best_wait(COVER_best_t *best) {
+void COVER_best_wait(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -786,7 +816,7 @@ static void COVER_best_wait(COVER_best_t *best) {
 /**
  * Call COVER_best_wait() and then destroy the COVER_best_t.
  */
-static void COVER_best_destroy(COVER_best_t *best) {
+void COVER_best_destroy(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -802,7 +832,7 @@ static void COVER_best_destroy(COVER_best_t *best) {
  * Called when a thread is about to be launched.
  * Increments liveJobs.
  */
-static void COVER_best_start(COVER_best_t *best) {
+void COVER_best_start(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -816,7 +846,7 @@ static void COVER_best_start(COVER_best_t *best) {
  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
  * If this dictionary is the best so far save it and its parameters.
  */
-static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
+void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
                               ZDICT_cover_params_t parameters, void *dict,
                               size_t dictSize) {
   if (!best) {
@@ -847,10 +877,10 @@ static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
       best->parameters = parameters;
       best->compressedSize = compressedSize;
     }
-    ZSTD_pthread_mutex_unlock(&best->mutex);
     if (liveJobs == 0) {
       ZSTD_pthread_cond_broadcast(&best->cond);
     }
+    ZSTD_pthread_mutex_unlock(&best->mutex);
   }
 }
 
@@ -904,51 +934,10 @@ static void COVER_tryParameters(void *opaque) {
     }
   }
   /* Check total compressed size */
-  {
-    /* Pointers */
-    ZSTD_CCtx *cctx;
-    ZSTD_CDict *cdict;
-    void *dst;
-    /* Local variables */
-    size_t dstCapacity;
-    size_t i;
-    /* Allocate dst with enough space to compress the maximum sized sample */
-    {
-      size_t maxSampleSize = 0;
-      i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0;
-      for (; i < ctx->nbSamples; ++i) {
-        maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
-      }
-      dstCapacity = ZSTD_compressBound(maxSampleSize);
-      dst = malloc(dstCapacity);
-    }
-    /* Create the cctx and cdict */
-    cctx = ZSTD_createCCtx();
-    cdict = ZSTD_createCDict(dict, dictBufferCapacity,
-                             parameters.zParams.compressionLevel);
-    if (!dst || !cctx || !cdict) {
-      goto _compressCleanup;
-    }
-    /* Compress each sample and sum their sizes (or error) */
-    totalCompressedSize = dictBufferCapacity;
-    i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0;
-    for (; i < ctx->nbSamples; ++i) {
-      const size_t size = ZSTD_compress_usingCDict(
-          cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
-          ctx->samplesSizes[i], cdict);
-      if (ZSTD_isError(size)) {
-        totalCompressedSize = ERROR(GENERIC);
-        goto _compressCleanup;
-      }
-      totalCompressedSize += size;
-    }
-  _compressCleanup:
-    ZSTD_freeCCtx(cctx);
-    ZSTD_freeCDict(cdict);
-    if (dst) {
-      free(dst);
-    }
-  }
+  totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
+                                                       ctx->samples, ctx->offsets,
+                                                       ctx->nbTrainSamples, ctx->nbSamples,
+                                                       dict, dictBufferCapacity);
 
 _cleanup:
   COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
diff --git a/lib/dictBuilder/cover.h b/lib/dictBuilder/cover.h
new file mode 100644
index 000000000..82e2e1cea
--- /dev/null
+++ b/lib/dictBuilder/cover.h
@@ -0,0 +1,83 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+/**
+ * COVER_best_t is used for two purposes:
+ * 1. Synchronizing threads.
+ * 2. Saving the best parameters and dictionary.
+ *
+ * All of the methods except COVER_best_init() are thread safe if zstd is
+ * compiled with multithreaded support.
+ */
+typedef struct COVER_best_s {
+  ZSTD_pthread_mutex_t mutex;
+  ZSTD_pthread_cond_t cond;
+  size_t liveJobs;
+  void *dict;
+  size_t dictSize;
+  ZDICT_cover_params_t parameters;
+  size_t compressedSize;
+} COVER_best_t;
+
+/**
+ * A segment is a range in the source as well as the score of the segment.
+ */
+typedef struct {
+  U32 begin;
+  U32 end;
+  U32 score;
+} COVER_segment_t;
+
+/**
+ *  Checks total compressed size of a dictionary
+ */
+size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+                                      const size_t *samplesSizes, const BYTE *samples,
+                                      size_t *offsets,
+                                      size_t nbTrainSamples, size_t nbSamples,
+                                      BYTE *const dict, size_t dictBufferCapacity);
+
+/**
+ * Returns the sum of the sample sizes.
+ */
+size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
+
+/**
+ * Initialize the `COVER_best_t`.
+ */
+void COVER_best_init(COVER_best_t *best);
+
+/**
+ * Wait until liveJobs == 0.
+ */
+void COVER_best_wait(COVER_best_t *best);
+
+/**
+ * Call COVER_best_wait() and then destroy the COVER_best_t.
+ */
+void COVER_best_destroy(COVER_best_t *best);
+
+/**
+ * Called when a thread is about to be launched.
+ * Increments liveJobs.
+ */
+void COVER_best_start(COVER_best_t *best);
+
+/**
+ * Called when a thread finishes executing, both on error or success.
+ * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
+ * If this dictionary is the best so far save it and its parameters.
+ */
+void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
+                       ZDICT_cover_params_t parameters, void *dict,
+                       size_t dictSize);
diff --git a/lib/dictBuilder/divsufsort.c b/lib/dictBuilder/divsufsort.c
index 60cceb088..ead922044 100644
--- a/lib/dictBuilder/divsufsort.c
+++ b/lib/dictBuilder/divsufsort.c
@@ -1637,7 +1637,7 @@ construct_SA(const unsigned char *T, int *SA,
             if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
             k = SA + BUCKET_B(c2 = c0, c1);
           }
-          assert(k < j);
+          assert(k < j); assert(k != NULL);
           *k-- = s;
         } else {
           assert(((s == 0) && (T[s] == c1)) || (s < 0));
@@ -1701,7 +1701,7 @@ construct_BWT(const unsigned char *T, int *SA,
             if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
             k = SA + BUCKET_B(c2 = c0, c1);
           }
-          assert(k < j);
+          assert(k < j); assert(k != NULL);
           *k-- = s;
         } else if(s != 0) {
           *j = ~s;
@@ -1785,7 +1785,7 @@ construct_BWT_indexes(const unsigned char *T, int *SA,
             if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
             k = SA + BUCKET_B(c2 = c0, c1);
           }
-          assert(k < j);
+          assert(k < j); assert(k != NULL);
           *k-- = s;
         } else if(s != 0) {
           *j = ~s;
diff --git a/lib/dictBuilder/fastcover.c b/lib/dictBuilder/fastcover.c
new file mode 100644
index 000000000..6ce8c8809
--- /dev/null
+++ b/lib/dictBuilder/fastcover.c
@@ -0,0 +1,701 @@
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "cover.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define FASTCOVER_MAX_F 31
+#define FASTCOVER_MAX_ACCEL 10
+#define DEFAULT_SPLITPOINT 0.75
+#define DEFAULT_F 20
+#define DEFAULT_ACCEL 1
+
+
+/*-*************************************
+*  Console display
+***************************************/
+static int g_displayLevel = 2;
+#define DISPLAY(...)                                                           \
+  {                                                                            \
+    fprintf(stderr, __VA_ARGS__);                                              \
+    fflush(stderr);                                                            \
+  }
+#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
+  if (displayLevel >= l) {                                                     \
+    DISPLAY(__VA_ARGS__);                                                      \
+  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
+
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  if (displayLevel >= l) {                                                     \
+    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  }
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
+static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
+static clock_t g_time = 0;
+
+
+/*-*************************************
+* Hash Functions
+***************************************/
+static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+
+
+/**
+ * Hash the d-byte value pointed to by p and mod 2^f
+ */
+static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 h, unsigned d) {
+  if (d == 6) {
+    return ZSTD_hash6Ptr(p, h) & ((1 << h) - 1);
+  }
+  return ZSTD_hash8Ptr(p, h) & ((1 << h) - 1);
+}
+
+
+/*-*************************************
+* Acceleration
+***************************************/
+typedef struct {
+  unsigned finalize;    /* Percentage of training samples used for ZDICT_finalizeDictionary */
+  unsigned skip;        /* Number of dmer skipped between each dmer counted in computeFrequency */
+} FASTCOVER_accel_t;
+
+
+static const FASTCOVER_accel_t FASTCOVER_defaultAccelParameters[FASTCOVER_MAX_ACCEL+1] = {
+  { 100, 0 },   /* accel = 0, should not happen because accel = 0 defaults to accel = 1 */
+  { 100, 0 },   /* accel = 1 */
+  { 50, 1 },   /* accel = 2 */
+  { 34, 2 },   /* accel = 3 */
+  { 25, 3 },   /* accel = 4 */
+  { 20, 4 },   /* accel = 5 */
+  { 17, 5 },   /* accel = 6 */
+  { 14, 6 },   /* accel = 7 */
+  { 13, 7 },   /* accel = 8 */
+  { 11, 8 },   /* accel = 9 */
+  { 10, 9 },   /* accel = 10 */
+};
+
+
+/*-*************************************
+* Context
+***************************************/
+typedef struct {
+  const BYTE *samples;
+  size_t *offsets;
+  const size_t *samplesSizes;
+  size_t nbSamples;
+  size_t nbTrainSamples;
+  size_t nbTestSamples;
+  size_t nbDmers;
+  U32 *freqs;
+  unsigned d;
+  unsigned f;
+  FASTCOVER_accel_t accelParams;
+} FASTCOVER_ctx_t;
+
+
+/*-*************************************
+*  Helper functions
+***************************************/
+/**
+ * Selects the best segment in an epoch.
+ * Segments of are scored according to the function:
+ *
+ * Let F(d) be the frequency of all dmers with hash value d.
+ * Let S_i be hash value of the dmer at position i of segment S which has length k.
+ *
+ *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
+ *
+ * Once the dmer with hash value d is in the dictionay we set F(d) = 0.
+ */
+static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
+                                              U32 *freqs, U32 begin, U32 end,
+                                              ZDICT_cover_params_t parameters,
+                                              U16* segmentFreqs) {
+  /* Constants */
+  const U32 k = parameters.k;
+  const U32 d = parameters.d;
+  const U32 f = ctx->f;
+  const U32 dmersInK = k - d + 1;
+
+  /* Try each segment (activeSegment) and save the best (bestSegment) */
+  COVER_segment_t bestSegment = {0, 0, 0};
+  COVER_segment_t activeSegment;
+
+  /* Reset the activeDmers in the segment */
+  /* The activeSegment starts at the beginning of the epoch. */
+  activeSegment.begin = begin;
+  activeSegment.end = begin;
+  activeSegment.score = 0;
+
+  /* Slide the activeSegment through the whole epoch.
+   * Save the best segment in bestSegment.
+   */
+  while (activeSegment.end < end) {
+    /* Get hash value of current dmer */
+    const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
+
+    /* Add frequency of this index to score if this is the first occurence of index in active segment */
+    if (segmentFreqs[index] == 0) {
+      activeSegment.score += freqs[index];
+    }
+    /* Increment end of segment and segmentFreqs*/
+    activeSegment.end += 1;
+    segmentFreqs[index] += 1;
+    /* If the window is now too large, drop the first position */
+    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+      /* Get hash value of the dmer to be eliminated from active segment */
+      const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
+      segmentFreqs[delIndex] -= 1;
+      /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */
+      if (segmentFreqs[delIndex] == 0) {
+        activeSegment.score -= freqs[delIndex];
+      }
+      /* Increment start of segment */
+      activeSegment.begin += 1;
+    }
+
+    /* If this segment is the best so far save it */
+    if (activeSegment.score > bestSegment.score) {
+      bestSegment = activeSegment;
+    }
+  }
+
+  /* Zero out rest of segmentFreqs array */
+  while (activeSegment.begin < end) {
+    const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
+    segmentFreqs[delIndex] -= 1;
+    activeSegment.begin += 1;
+  }
+
+  {
+    /*  Zero the frequency of hash value of each dmer covered by the chosen segment. */
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, f, d);
+      freqs[i] = 0;
+    }
+  }
+
+  return bestSegment;
+}
+
+
+static int FASTCOVER_checkParameters(ZDICT_cover_params_t parameters,
+                                     size_t maxDictSize, unsigned f,
+                                     unsigned accel) {
+  /* k, d, and f are required parameters */
+  if (parameters.d == 0 || parameters.k == 0) {
+    return 0;
+  }
+  /* d has to be 6 or 8 */
+  if (parameters.d != 6 && parameters.d != 8) {
+    return 0;
+  }
+  /* k <= maxDictSize */
+  if (parameters.k > maxDictSize) {
+    return 0;
+  }
+  /* d <= k */
+  if (parameters.d > parameters.k) {
+    return 0;
+  }
+  /* 0 < f <= FASTCOVER_MAX_F*/
+  if (f > FASTCOVER_MAX_F || f == 0) {
+    return 0;
+  }
+  /* 0 < splitPoint <= 1 */
+  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
+    return 0;
+  }
+  /* 0 < accel <= 10 */
+  if (accel > 10 || accel == 0) {
+    return 0;
+  }
+  return 1;
+}
+
+
+/**
+ * Clean up a context initialized with `FASTCOVER_ctx_init()`.
+ */
+static void FASTCOVER_ctx_destroy(FASTCOVER_ctx_t *ctx) {
+  if (!ctx) {
+    return;
+  }
+
+  free(ctx->freqs);
+  ctx->freqs = NULL;
+
+  free(ctx->offsets);
+  ctx->offsets = NULL;
+}
+
+
+/**
+ * Calculate for frequency of hash value of each dmer in ctx->samples
+ */
+static void FASTCOVER_computeFrequency(U32 *freqs, FASTCOVER_ctx_t *ctx){
+  const unsigned f = ctx->f;
+  const unsigned d = ctx->d;
+  const unsigned skip = ctx->accelParams.skip;
+  const unsigned readLength = MAX(d, 8);
+  size_t start; /* start of current dmer */
+  size_t i;
+  for (i = 0; i < ctx->nbTrainSamples; i++) {
+    size_t currSampleStart = ctx->offsets[i];
+    size_t currSampleEnd = ctx->offsets[i+1];
+    start = currSampleStart;
+    while (start + readLength <= currSampleEnd) {
+      const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, d);
+      freqs[dmerIndex]++;
+      start = start + skip + 1;
+    }
+  }
+}
+
+
+/**
+ * Prepare a context for dictionary building.
+ * The context is only dependent on the parameter `d` and can used multiple
+ * times.
+ * Returns 1 on success or zero on error.
+ * The context must be destroyed with `FASTCOVER_ctx_destroy()`.
+ */
+static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer,
+                              const size_t *samplesSizes, unsigned nbSamples,
+                              unsigned d, double splitPoint, unsigned f,
+                              FASTCOVER_accel_t accelParams) {
+  const BYTE *const samples = (const BYTE *)samplesBuffer;
+  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+  /* Split samples into testing and training sets */
+  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+  const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+  const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
+  /* Checks */
+  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
+      totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
+    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                 (U32)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
+    return 0;
+  }
+  /* Check if there are at least 5 training samples */
+  if (nbTrainSamples < 5) {
+    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
+    return 0;
+  }
+  /* Check if there's testing sample */
+  if (nbTestSamples < 1) {
+    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
+    return 0;
+  }
+  /* Zero the context */
+  memset(ctx, 0, sizeof(*ctx));
+  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
+               (U32)trainingSamplesSize);
+  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
+               (U32)testSamplesSize);
+
+  ctx->samples = samples;
+  ctx->samplesSizes = samplesSizes;
+  ctx->nbSamples = nbSamples;
+  ctx->nbTrainSamples = nbTrainSamples;
+  ctx->nbTestSamples = nbTestSamples;
+  ctx->nbDmers = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
+  ctx->d = d;
+  ctx->f = f;
+  ctx->accelParams = accelParams;
+
+  /* The offsets of each file */
+  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
+  if (!ctx->offsets) {
+    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
+    FASTCOVER_ctx_destroy(ctx);
+    return 0;
+  }
+
+  /* Fill offsets from the samplesSizes */
+  {
+    U32 i;
+    ctx->offsets[0] = 0;
+    for (i = 1; i <= nbSamples; ++i) {
+      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+    }
+  }
+
+  /* Initialize frequency array of size 2^f */
+  ctx->freqs = (U32 *)calloc(((U64)1 << f), sizeof(U32));
+
+  DISPLAYLEVEL(2, "Computing frequencies\n");
+  FASTCOVER_computeFrequency(ctx->freqs, ctx);
+
+  return 1;
+}
+
+
+/**
+ * Given the prepared context build the dictionary.
+ */
+static size_t FASTCOVER_buildDictionary(const FASTCOVER_ctx_t *ctx, U32 *freqs,
+                                        void *dictBuffer, size_t dictBufferCapacity,
+                                        ZDICT_cover_params_t parameters, U16* segmentFreqs){
+  BYTE *const dict = (BYTE *)dictBuffer;
+  size_t tail = dictBufferCapacity;
+  /* Divide the data up into epochs of equal size.
+   * We will select at least one segment from each epoch.
+   */
+  const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k));
+  const U32 epochSize = (U32)(ctx->nbDmers / epochs);
+  size_t epoch;
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
+               epochSize);
+  /* Loop through the epochs until there are no more segments or the dictionary
+   * is full.
+   */
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
+    const U32 epochBegin = (U32)(epoch * epochSize);
+    const U32 epochEnd = epochBegin + epochSize;
+    size_t segmentSize;
+    /* Select a segment */
+    COVER_segment_t segment = FASTCOVER_selectSegment(
+        ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
+
+    /* If the segment covers no dmers, then we are out of content */
+    if (segment.score == 0) {
+      break;
+    }
+
+    /* Trim the segment if necessary and if it is too small then we are done */
+    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
+    if (segmentSize < parameters.d) {
+      break;
+    }
+
+    /* We fill the dictionary from the back to allow the best segments to be
+     * referenced with the smallest offsets.
+     */
+    tail -= segmentSize;
+    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+    DISPLAYUPDATE(
+        2, "\r%u%%       ",
+        (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+  }
+  DISPLAYLEVEL(2, "\r%79s\r", "");
+  return tail;
+}
+
+
+/**
+ * Parameters for FASTCOVER_tryParameters().
+ */
+typedef struct FASTCOVER_tryParameters_data_s {
+  const FASTCOVER_ctx_t *ctx;
+  COVER_best_t *best;
+  size_t dictBufferCapacity;
+  ZDICT_cover_params_t parameters;
+} FASTCOVER_tryParameters_data_t;
+
+
+/**
+ * Tries a set of parameters and updates the COVER_best_t with the results.
+ * This function is thread safe if zstd is compiled with multithreaded support.
+ * It takes its parameters as an *OWNING* opaque pointer to support threading.
+ */
+static void FASTCOVER_tryParameters(void *opaque) {
+  /* Save parameters as local variables */
+  FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque;
+  const FASTCOVER_ctx_t *const ctx = data->ctx;
+  const ZDICT_cover_params_t parameters = data->parameters;
+  size_t dictBufferCapacity = data->dictBufferCapacity;
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Initialize array to keep track of frequency of dmer within activeSegment */
+  U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
+  /* Allocate space for hash table, dict, and freqs */
+  BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
+  U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
+  if (!segmentFreqs || !dict || !freqs) {
+    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+    goto _cleanup;
+  }
+  /* Copy the frequencies because we need to modify them */
+  memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
+  /* Build the dictionary */
+  {
+    const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
+                                                  parameters, segmentFreqs);
+    const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
+    dictBufferCapacity = ZDICT_finalizeDictionary(
+        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+        ctx->samples, ctx->samplesSizes, nbFinalizeSamples, parameters.zParams);
+    if (ZDICT_isError(dictBufferCapacity)) {
+      DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
+      goto _cleanup;
+    }
+  }
+  /* Check total compressed size */
+  totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
+                                                       ctx->samples, ctx->offsets,
+                                                       ctx->nbTrainSamples, ctx->nbSamples,
+                                                       dict, dictBufferCapacity);
+_cleanup:
+  COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
+                    dictBufferCapacity);
+  free(data);
+  free(segmentFreqs);
+  free(dict);
+  free(freqs);
+}
+
+
+
+static void FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
+                                          ZDICT_cover_params_t *coverParams) {
+    coverParams->k = fastCoverParams.k;
+    coverParams->d = fastCoverParams.d;
+    coverParams->steps = fastCoverParams.steps;
+    coverParams->nbThreads = fastCoverParams.nbThreads;
+    coverParams->splitPoint = fastCoverParams.splitPoint;
+    coverParams->zParams = fastCoverParams.zParams;
+}
+
+
+static void FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
+                                          ZDICT_fastCover_params_t *fastCoverParams,
+                                          unsigned f, unsigned accel) {
+    fastCoverParams->k = coverParams.k;
+    fastCoverParams->d = coverParams.d;
+    fastCoverParams->steps = coverParams.steps;
+    fastCoverParams->nbThreads = coverParams.nbThreads;
+    fastCoverParams->splitPoint = coverParams.splitPoint;
+    fastCoverParams->f = f;
+    fastCoverParams->accel = accel;
+    fastCoverParams->zParams = coverParams.zParams;
+}
+
+
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters) {
+    BYTE* const dict = (BYTE*)dictBuffer;
+    FASTCOVER_ctx_t ctx;
+    ZDICT_cover_params_t coverParams;
+    FASTCOVER_accel_t accelParams;
+    /* Initialize global data */
+    g_displayLevel = parameters.zParams.notificationLevel;
+    /* Assign splitPoint and f if not provided */
+    parameters.splitPoint = 1.0;
+    parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f;
+    parameters.accel = parameters.accel == 0 ? DEFAULT_ACCEL : parameters.accel;
+    /* Convert to cover parameter */
+    memset(&coverParams, 0 , sizeof(coverParams));
+    FASTCOVER_convertToCoverParams(parameters, &coverParams);
+    /* Checks */
+    if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
+                                   parameters.accel)) {
+      DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+      return ERROR(GENERIC);
+    }
+    if (nbSamples == 0) {
+      DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
+      return ERROR(GENERIC);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                   ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    /* Assign corresponding FASTCOVER_accel_t to accelParams*/
+    accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
+    /* Initialize context */
+    if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
+                            coverParams.d, parameters.splitPoint, parameters.f,
+                            accelParams)) {
+      DISPLAYLEVEL(1, "Failed to initialize context\n");
+      return ERROR(GENERIC);
+    }
+    /* Build the dictionary */
+    DISPLAYLEVEL(2, "Building dictionary\n");
+    {
+      /* Initialize array to keep track of frequency of dmer within activeSegment */
+      U16* segmentFreqs = (U16 *)calloc(((U64)1 << parameters.f), sizeof(U16));
+      const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer,
+                                                dictBufferCapacity, coverParams, segmentFreqs);
+      const unsigned nbFinalizeSamples = (unsigned)(ctx.nbTrainSamples * ctx.accelParams.finalize / 100);
+      const size_t dictionarySize = ZDICT_finalizeDictionary(
+          dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+          samplesBuffer, samplesSizes, nbFinalizeSamples, coverParams.zParams);
+      if (!ZSTD_isError(dictionarySize)) {
+          DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
+                      (U32)dictionarySize);
+      }
+      FASTCOVER_ctx_destroy(&ctx);
+      free(segmentFreqs);
+      return dictionarySize;
+    }
+}
+
+
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_fastCover_params_t *parameters) {
+    ZDICT_cover_params_t coverParams;
+    FASTCOVER_accel_t accelParams;
+    /* constants */
+    const unsigned nbThreads = parameters->nbThreads;
+    const double splitPoint =
+        parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
+    const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
+    const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
+    const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
+    const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
+    const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
+    const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
+    const unsigned kIterations =
+        (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
+    const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
+    const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
+    /* Local variables */
+    const int displayLevel = parameters->zParams.notificationLevel;
+    unsigned iteration = 1;
+    unsigned d;
+    unsigned k;
+    COVER_best_t best;
+    POOL_ctx *pool = NULL;
+    /* Checks */
+    if (splitPoint <= 0 || splitPoint > 1) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
+      return ERROR(GENERIC);
+    }
+    if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
+      return ERROR(GENERIC);
+    }
+    if (kMinK < kMaxD || kMaxK < kMinK) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
+      return ERROR(GENERIC);
+    }
+    if (nbSamples == 0) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
+      return ERROR(GENERIC);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
+                   ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    if (nbThreads > 1) {
+      pool = POOL_create(nbThreads, 1);
+      if (!pool) {
+        return ERROR(memory_allocation);
+      }
+    }
+    /* Initialization */
+    COVER_best_init(&best);
+    memset(&coverParams, 0 , sizeof(coverParams));
+    FASTCOVER_convertToCoverParams(*parameters, &coverParams);
+    accelParams = FASTCOVER_defaultAccelParameters[accel];
+    /* Turn down global display level to clean up display at level 2 and below */
+    g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
+    /* Loop through d first because each new value needs a new context */
+    LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
+                      kIterations);
+    for (d = kMinD; d <= kMaxD; d += 2) {
+      /* Initialize the context for this value of d */
+      FASTCOVER_ctx_t ctx;
+      LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
+      if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams)) {
+        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+        COVER_best_destroy(&best);
+        POOL_free(pool);
+        return ERROR(GENERIC);
+      }
+      /* Loop through k reusing the same context */
+      for (k = kMinK; k <= kMaxK; k += kStepSize) {
+        /* Prepare the arguments */
+        FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc(
+            sizeof(FASTCOVER_tryParameters_data_t));
+        LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
+        if (!data) {
+          LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
+          COVER_best_destroy(&best);
+          FASTCOVER_ctx_destroy(&ctx);
+          POOL_free(pool);
+          return ERROR(GENERIC);
+        }
+        data->ctx = &ctx;
+        data->best = &best;
+        data->dictBufferCapacity = dictBufferCapacity;
+        data->parameters = coverParams;
+        data->parameters.k = k;
+        data->parameters.d = d;
+        data->parameters.splitPoint = splitPoint;
+        data->parameters.steps = kSteps;
+        data->parameters.zParams.notificationLevel = g_displayLevel;
+        /* Check the parameters */
+        if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
+                                       data->ctx->f, accel)) {
+          DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+          free(data);
+          continue;
+        }
+        /* Call the function and pass ownership of data to it */
+        COVER_best_start(&best);
+        if (pool) {
+          POOL_add(pool, &FASTCOVER_tryParameters, data);
+        } else {
+          FASTCOVER_tryParameters(data);
+        }
+        /* Print status */
+        LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
+                           (U32)((iteration * 100) / kIterations));
+        ++iteration;
+      }
+      COVER_best_wait(&best);
+      FASTCOVER_ctx_destroy(&ctx);
+    }
+    LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
+    /* Fill the output buffer and parameters with output of the best parameters */
+    {
+      const size_t dictSize = best.dictSize;
+      if (ZSTD_isError(best.compressedSize)) {
+        const size_t compressedSize = best.compressedSize;
+        COVER_best_destroy(&best);
+        POOL_free(pool);
+        return compressedSize;
+      }
+      FASTCOVER_convertToFastCoverParams(best.parameters, parameters, f, accel);
+      memcpy(dictBuffer, best.dict, dictSize);
+      COVER_best_destroy(&best);
+      POOL_free(pool);
+      return dictSize;
+    }
+
+}
diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c
index b8a51789a..eb1a5b669 100644
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@@ -698,7 +698,7 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
     short litLengthNCount[MaxLL+1];
     U32 repOffset[MAXREPOFFSET];
     offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
-    EStats_ress_t esr;
+    EStats_ress_t esr = { NULL, NULL, NULL };
     ZSTD_parameters params;
     U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
     size_t pos = 0, errorCode;
@@ -863,8 +863,8 @@ _cleanup:
 
 size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
                           const void* customDictContent, size_t dictContentSize,
-                          const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                          ZDICT_params_t params)
+                          const void* samplesBuffer, const size_t* samplesSizes,
+                          unsigned nbSamples, ZDICT_params_t params)
 {
     size_t hSize;
 #define HBUFFSIZE 256   /* should prove large enough for all entropy headers */
@@ -987,8 +987,10 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
             U32 const pos = dictList[u].pos;
             U32 const length = dictList[u].length;
             U32 const printedLength = MIN(40, length);
-            if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize))
+            if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {
+                free(dictList);
                 return ERROR(GENERIC);   /* should never happen */
+            }
             DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
                          u, length, pos, dictList[u].savings);
             ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
@@ -1078,17 +1080,17 @@ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
 {
-    ZDICT_cover_params_t params;
+    ZDICT_fastCover_params_t params;
     DEBUGLOG(3, "ZDICT_trainFromBuffer");
     memset(&params, 0, sizeof(params));
     params.d = 8;
     params.steps = 4;
     /* Default to level 6 since no compression level information is available */
-    params.zParams.compressionLevel = 6;
+    params.zParams.compressionLevel = 3;
 #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
     params.zParams.notificationLevel = DEBUGLEVEL;
 #endif
-    return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
+    return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
                                                samplesBuffer, samplesSizes, nbSamples,
                                                &params);
 }
diff --git a/lib/dictBuilder/zdict.h b/lib/dictBuilder/zdict.h
index 4094669d1..d57d59f01 100644
--- a/lib/dictBuilder/zdict.h
+++ b/lib/dictBuilder/zdict.h
@@ -39,7 +39,8 @@ extern "C" {
 
 /*! ZDICT_trainFromBuffer():
  *  Train a dictionary from an array of samples.
- *  Redirect towards ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4.
+ *  Redirect towards ZDICT_optimizeTrainFromBuffer_fastCover() single-threaded, with d=8, steps=4,
+ *  f=20, and accel=1.
  *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
  *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
  *  The resulting dictionary will be saved into `dictBuffer`.
@@ -52,7 +53,8 @@ extern "C" {
  *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
  */
 ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
-                                    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
+                                    const void* samplesBuffer,
+                                    const size_t* samplesSizes, unsigned nbSamples);
 
 
 /*======   Helper functions   ======*/
@@ -84,12 +86,22 @@ typedef struct {
 typedef struct {
     unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
     unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
-    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */
+    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
     unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
-    double splitPoint;           /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
+    double splitPoint;           /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
     ZDICT_params_t zParams;
 } ZDICT_cover_params_t;
 
+typedef struct {
+    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
+    unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
+    unsigned f;                  /* log of size of frequency array : constraint: 0 < f <= 31 : 1 means default(20)*/
+    unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
+    unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
+    double splitPoint;           /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
+    unsigned accel;              /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
+    ZDICT_params_t zParams;
+} ZDICT_fastCover_params_t;
 
 /*! ZDICT_trainFromBuffer_cover():
  *  Train a dictionary from an array of samples using the COVER algorithm.
@@ -116,9 +128,9 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
  * dictionary constructed with those parameters is stored in `dictBuffer`.
  *
  * All of the parameters d, k, steps are optional.
- * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
  * if steps is zero it defaults to its default value.
- * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048].
+ * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [50, 2000].
  *
  * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
  *           or an error code, which can be tested with ZDICT_isError().
@@ -130,6 +142,48 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
     const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
           ZDICT_cover_params_t* parameters);
 
+/*! ZDICT_trainFromBuffer_fastCover():
+ *  Train a dictionary from an array of samples using a modified version of COVER algorithm.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  d and k are required.
+ *  All other parameters are optional, will use default values if not provided
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Note: ZDICT_trainFromBuffer_fastCover() requires about 1 bytes of memory for each input byte and additionally another 6 * 2^f bytes of memory .
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer,
+                    size_t dictBufferCapacity, const void *samplesBuffer,
+                    const size_t *samplesSizes, unsigned nbSamples,
+                    ZDICT_fastCover_params_t parameters);
+
+/*! ZDICT_optimizeTrainFromBuffer_fastCover():
+ * The same requirements as above hold for all the parameters except `parameters`.
+ * This function tries many parameter combinations (specifically, k and d combinations)
+ * and picks the best parameters. `*parameters` is filled with the best parameters found,
+ * dictionary constructed with those parameters is stored in `dictBuffer`.
+ * All of the parameters d, k, steps, f, and accel are optional.
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
+ * if steps is zero it defaults to its default value.
+ * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [50, 2000].
+ * If f is zero, default value of 20 is used.
+ * If accel is zero, default value of 1 is used.
+ *
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ *           On success `*parameters` contains the parameters selected.
+ * Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 1 byte of memory for each input byte and additionally another 6 * 2^f bytes of memory for each thread.
+ */
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
+                    size_t dictBufferCapacity, const void* samplesBuffer,
+                    const size_t* samplesSizes, unsigned nbSamples,
+                    ZDICT_fastCover_params_t* parameters);
+
 /*! ZDICT_finalizeDictionary():
  * Given a custom content as a basis for dictionary, and a set of samples,
  * finalize dictionary by adding headers and statistics.
diff --git a/lib/legacy/zstd_v04.c b/lib/legacy/zstd_v04.c
index a2e2cfa80..15000db6d 100644
--- a/lib/legacy/zstd_v04.c
+++ b/lib/legacy/zstd_v04.c
@@ -1093,6 +1093,7 @@ static size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, un
     if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
 
     /* Init, lay down lowprob symbols */
+    memset(tableDecode, 0, sizeof(FSE_DECODE_TYPE) * (maxSymbolValue+1) );   /* useless init, but keep static analyzer happy, and we don't need to performance optimize legacy decoders */
     DTableH.tableLog = (U16)tableLog;
     for (s=0; s<=maxSymbolValue; s++)
     {
diff --git a/lib/legacy/zstd_v05.c b/lib/legacy/zstd_v05.c
index a5e1b1ffc..65f07a3b5 100644
--- a/lib/legacy/zstd_v05.c
+++ b/lib/legacy/zstd_v05.c
@@ -1224,6 +1224,7 @@ size_t FSEv05_buildDTable(FSEv05_DTable* dt, const short* normalizedCounter, uns
     if (tableLog > FSEv05_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
 
     /* Init, lay down lowprob symbols */
+    memset(tableDecode, 0, sizeof(FSEv05_FUNCTION_TYPE) * (maxSymbolValue+1) );   /* useless init, but keep static analyzer happy, and we don't need to performance optimize legacy decoders */
     DTableH.tableLog = (U16)tableLog;
     for (s=0; s<=maxSymbolValue; s++) {
         if (normalizedCounter[s]==-1) {
@@ -2845,6 +2846,7 @@ size_t ZSTDv05_getcBlockSize(const void* src, size_t srcSize, blockProperties_t*
 
 static size_t ZSTDv05_copyRawBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
+    if (dst==NULL) return ERROR(dstSize_tooSmall);
     if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall);
     memcpy(dst, src, srcSize);
     return srcSize;
diff --git a/lib/legacy/zstd_v06.c b/lib/legacy/zstd_v06.c
index 8b068b3e5..ca982c35e 100644
--- a/lib/legacy/zstd_v06.c
+++ b/lib/legacy/zstd_v06.c
@@ -3041,6 +3041,7 @@ size_t ZSTDv06_getcBlockSize(const void* src, size_t srcSize, blockProperties_t*
 
 static size_t ZSTDv06_copyRawBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
+    if (dst==NULL) return ERROR(dstSize_tooSmall);
     if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall);
     memcpy(dst, src, srcSize);
     return srcSize;
@@ -4006,7 +4007,7 @@ size_t ZBUFFv06_decompressContinue(ZBUFFv06_DCtx* zbd,
                     if (ZSTDv06_isError(hSize)) return hSize;
                     if (toLoad > (size_t)(iend-ip)) {   /* not enough input to load full header */
                         memcpy(zbd->headerBuffer + zbd->lhSize, ip, iend-ip);
-                        zbd->lhSize += iend-ip; ip = iend; notDone = 0;
+                        zbd->lhSize += iend-ip;
                         *dstCapacityPtr = 0;
                         return (hSize - zbd->lhSize) + ZSTDv06_blockHeaderSize;   /* remaining header bytes + next block header */
                     }
diff --git a/lib/legacy/zstd_v07.c b/lib/legacy/zstd_v07.c
index 70b170f0f..9f95f62ad 100644
--- a/lib/legacy/zstd_v07.c
+++ b/lib/legacy/zstd_v07.c
@@ -3150,10 +3150,10 @@ size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src,
     const BYTE* ip = (const BYTE*)src;
 
     if (srcSize < ZSTDv07_frameHeaderSize_min) return ZSTDv07_frameHeaderSize_min;
+    memset(fparamsPtr, 0, sizeof(*fparamsPtr));
     if (MEM_readLE32(src) != ZSTDv07_MAGICNUMBER) {
         if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTDv07_MAGIC_SKIPPABLE_START) {
             if (srcSize < ZSTDv07_skippableHeaderSize) return ZSTDv07_skippableHeaderSize; /* magic number + skippable frame length */
-            memset(fparamsPtr, 0, sizeof(*fparamsPtr));
             fparamsPtr->frameContentSize = MEM_readLE32((const char *)src + 4);
             fparamsPtr->windowSize = 0; /* windowSize==0 means a frame is skippable */
             return 0;
@@ -3175,11 +3175,13 @@ size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src,
         U32 windowSize = 0;
         U32 dictID = 0;
         U64 frameContentSize = 0;
-        if ((fhdByte & 0x08) != 0) return ERROR(frameParameter_unsupported);   /* reserved bits, which must be zero */
+        if ((fhdByte & 0x08) != 0)   /* reserved bits, which must be zero */
+            return ERROR(frameParameter_unsupported);
         if (!directMode) {
             BYTE const wlByte = ip[pos++];
             U32 const windowLog = (wlByte >> 3) + ZSTDv07_WINDOWLOG_ABSOLUTEMIN;
-            if (windowLog > ZSTDv07_WINDOWLOG_MAX) return ERROR(frameParameter_unsupported);
+            if (windowLog > ZSTDv07_WINDOWLOG_MAX)
+                return ERROR(frameParameter_unsupported);
             windowSize = (1U << windowLog);
             windowSize += (windowSize >> 3) * (wlByte&7);
         }
@@ -3201,7 +3203,8 @@ size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src,
             case 3 : frameContentSize = MEM_readLE64(ip+pos); break;
         }
         if (!windowSize) windowSize = (U32)frameContentSize;
-        if (windowSize > windowSizeMax) return ERROR(frameParameter_unsupported);
+        if (windowSize > windowSizeMax)
+            return ERROR(frameParameter_unsupported);
         fparamsPtr->frameContentSize = frameContentSize;
         fparamsPtr->windowSize = windowSize;
         fparamsPtr->dictID = dictID;
@@ -3220,11 +3223,10 @@ size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src,
                    - frame header not completely provided (`srcSize` too small) */
 unsigned long long ZSTDv07_getDecompressedSize(const void* src, size_t srcSize)
 {
-    {   ZSTDv07_frameParams fparams;
-        size_t const frResult = ZSTDv07_getFrameParams(&fparams, src, srcSize);
-        if (frResult!=0) return 0;
-        return fparams.frameContentSize;
-    }
+    ZSTDv07_frameParams fparams;
+    size_t const frResult = ZSTDv07_getFrameParams(&fparams, src, srcSize);
+    if (frResult!=0) return 0;
+    return fparams.frameContentSize;
 }
 
 
diff --git a/lib/zstd.h b/lib/zstd.h
index edb107c2b..25441e68c 100644
--- a/lib/zstd.h
+++ b/lib/zstd.h
@@ -35,26 +35,38 @@ extern "C" {
 #endif
 
 
-/*******************************************************************************************************
+/*******************************************************************************
   Introduction
 
-  zstd, short for Zstandard, is a fast lossless compression algorithm,
-  targeting real-time compression scenarios at zlib-level and better compression ratios.
-  The zstd compression library provides in-memory compression and decompression functions.
-  The library supports compression levels from 1 up to ZSTD_maxCLevel() which is currently 22.
-  Levels >= 20, labeled `--ultra`, should be used with caution, as they require more memory.
+  zstd, short for Zstandard, is a fast lossless compression algorithm, targeting
+  real-time compression scenarios at zlib-level and better compression ratios.
+  The zstd compression library provides in-memory compression and decompression
+  functions.
+
+  The library supports regular compression levels from 1 up to ZSTD_maxCLevel(),
+  which is currently 22. Levels >= 20, labeled `--ultra`, should be used with
+  caution, as they require more memory. The library also offers negative
+  compression levels, which extend the range of speed vs. ratio preferences.
+  The lower the level, the faster the speed (at the cost of compression).
+
   Compression can be done in:
     - a single step (described as Simple API)
     - a single step, reusing a context (described as Explicit context)
     - unbounded multiple steps (described as Streaming compression)
-  The compression ratio achievable on small data can be highly improved using a dictionary in:
-    - a single step (described as Simple dictionary API)
-    - a single step, reusing a dictionary (described as Bulk-processing dictionary API)
 
-  Advanced experimental functions can be accessed using #define ZSTD_STATIC_LINKING_ONLY before including zstd.h.
-  Advanced experimental APIs shall never be used with a dynamic library.
-  They are not "stable", their definition may change in the future. Only static linking is allowed.
-*********************************************************************************************************/
+  The compression ratio achievable on small data can be highly improved using
+  a dictionary. Dictionary compression can be performed in:
+    - a single step (described as Simple dictionary API)
+    - a single step, reusing a dictionary (described as Bulk-processing
+      dictionary API)
+
+  Advanced experimental functions can be accessed using
+  `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h.
+
+  Advanced experimental APIs should never be used with a dynamically-linked
+  library. They are not "stable"; their definitions or signatures may change in
+  the future. Only static linking is allowed.
+*******************************************************************************/
 
 /*------   Version   ------*/
 #define ZSTD_VERSION_MAJOR    1
@@ -211,7 +223,8 @@ typedef struct ZSTD_CDict_s ZSTD_CDict;
  *  When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once.
  *  ZSTD_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
  *  ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
- *  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict */
+ *  `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict
+ *  Note : A ZSTD_CDict can be created with an empty dictionary, but it is inefficient for small data. */
 ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
                                          int compressionLevel);
 
@@ -223,7 +236,9 @@ ZSTDLIB_API size_t      ZSTD_freeCDict(ZSTD_CDict* CDict);
  *  Compression using a digested Dictionary.
  *  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
  *  Note that compression level is decided during dictionary creation.
- *  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
+ *  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no)
+ *  Note : ZSTD_compress_usingCDict() can be used with a ZSTD_CDict created from an empty dictionary.
+ *         But it is inefficient for small data, and it is recommended to use ZSTD_compressCCtx(). */
 ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
                                             void* dst, size_t dstCapacity,
                                       const void* src, size_t srcSize,
@@ -1161,16 +1176,21 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
 
 /*! ZSTD_CCtx_refPrefix() :
  *  Reference a prefix (single-usage dictionary) for next compression job.
- *  Decompression need same prefix to properly regenerate data.
- *  Prefix is **only used once**. Tables are discarded at end of compression job (ZSTD_e_end).
+ *  Decompression will need same prefix to properly regenerate data.
+ *  Compressing with a prefix is similar in outcome as performing a diff and compressing it,
+ *  but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ *  Note that prefix is **only used once**. Tables are discarded at end of compression job (ZSTD_e_end).
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
  *  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
  *  Note 1 : Prefix buffer is referenced. It **must** outlive compression job.
  *           Its contain must remain unmodified up to end of compression (ZSTD_e_end).
- *  Note 2 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+ *  Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+ *           ensure that the window size is large enough to contain the entire source.
+ *           See ZSTD_p_windowLog.
+ *  Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
  *           It's a CPU consuming operation, with non-negligible impact on latency.
  *           If there is a need to use same prefix multiple times, consider loadDictionary instead.
- *  Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
+ *  Note 4 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
  *           Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode. */
 ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
                                        const void* prefix, size_t prefixSize);
@@ -1353,6 +1373,8 @@ ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
 
 /*! ZSTD_DCtx_refPrefix() :
  *  Reference a prefix (single-usage dictionary) for next compression job.
+ *  This is the reverse operation of ZSTD_CCtx_refPrefix(),
+ *  and must use the same prefix as the one used during compression.
  *  Prefix is **only used once**. Reference is discarded at end of frame.
  *  End of frame is reached when ZSTD_DCtx_decompress_generic() returns 0.
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
diff --git a/programs/Makefile b/programs/Makefile
index 912f9eff0..f1a96325c 100644
--- a/programs/Makefile
+++ b/programs/Makefile
@@ -132,6 +132,15 @@ else
 LZ4_MSG := $(NO_LZ4_MSG)
 endif
 
+# enable backtrace symbol names for Linux/Darwin
+ALL_SYMBOLS := 0
+ifeq (,$(filter Windows%, $(OS)))
+ifeq ($(ALL_SYMBOLS), 1)
+DEBUGFLAGS_LD+=-rdynamic
+endif
+endif
+
+
 .PHONY: default
 default: zstd-release
 
@@ -144,7 +153,7 @@ allVariants: zstd zstd-compress zstd-decompress zstd-small zstd-nolegacy
 $(ZSTDDECOMP_O): CFLAGS += $(ALIGN_LOOP)
 
 zstd : CPPFLAGS += $(THREAD_CPP) $(ZLIBCPP) $(LZMACPP) $(LZ4CPP)
-zstd : LDFLAGS += $(THREAD_LD) $(ZLIBLD) $(LZMALD) $(LZ4LD)
+zstd : LDFLAGS += $(THREAD_LD) $(ZLIBLD) $(LZMALD) $(LZ4LD) $(DEBUGFLAGS_LD)
 zstd : CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
 zstd : $(ZSTDLIB_FILES) zstdcli.o fileio.o bench.o datagen.o dibio.o
 	@echo "$(THREAD_MSG)"
@@ -158,6 +167,7 @@ endif
 
 .PHONY: zstd-release
 zstd-release: DEBUGFLAGS :=
+zstd-release: DEBUGFLAGS_LD :=
 zstd-release: zstd
 
 zstd32 : CPPFLAGS += $(THREAD_CPP)
diff --git a/programs/README.md b/programs/README.md
index 22a00409c..804cb8b0a 100644
--- a/programs/README.md
+++ b/programs/README.md
@@ -61,6 +61,13 @@ There are however other Makefile targets that create different variations of CLI
   In which case, linking stage will fail if `lz4` library cannot be found.
   This is useful to prevent silent feature disabling.
 
+- __ALL_SYMBOLS__ : `zstd` can display a stack backtrace if the execution
+  generates a runtime exception. By default, this feature may be
+  degraded/disabled on some platforms unless additional compiler directives are
+  applied. When triaging a runtime issue, enabling this feature can provided
+  more context to determine the location of the fault.
+  Example : `make zstd ALL_SYMBOLS=1`
+
 
 #### Aggregation of parameters
 CLI supports aggregation of parameters i.e. `-b1`, `-e18`, and `-i1` can be joined into `-b1e18i1`.
@@ -151,6 +158,7 @@ Advanced arguments :
 Dictionary builder :
 --train ## : create a dictionary from a training set of files
 --train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args
+--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#] : use the fastcover algorithm with optional args
 --train-legacy[=s=#] : use the legacy algorithm with selectivity (default: 9)
  -o file : `file` is dictionary name (default: dictionary)
 --maxdict=# : limit dictionary to specified size (default: 112640)
diff --git a/programs/bench.c b/programs/bench.c
index 79ef42caa..326c1c1c5 100644
--- a/programs/bench.c
+++ b/programs/bench.c
@@ -63,7 +63,12 @@
 #define MB *(1 <<20)
 #define GB *(1U<<30)
 
-static const size_t maxMemory = (sizeof(size_t)==4)  ?  (2 GB - 64 MB) : (size_t)(1ULL << ((sizeof(size_t)*8)-31));
+#define BMK_RUNTEST_DEFAULT_MS 1000
+
+static const size_t maxMemory = (sizeof(size_t)==4)  ?
+                    /* 32-bit */ (2 GB - 64 MB) :
+                    /* 64-bit */ (size_t)(1ULL << ((sizeof(size_t)*8)-31));
+
 
 /* *************************************
 *  console display
@@ -97,26 +102,26 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
     return errorNum;                                  \
 }
 
-#define EXM_THROW(errorNum, retType, ...)  {          \
+#define RETURN_ERROR(errorNum, retType, ...)  {       \
     retType r;                                        \
     memset(&r, 0, sizeof(retType));                   \
     DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);    \
     DISPLAYLEVEL(1, "Error %i : ", errorNum);         \
     DISPLAYLEVEL(1, __VA_ARGS__);                     \
     DISPLAYLEVEL(1, " \n");                           \
-    r.error = errorNum;                               \
+    r.tag = errorNum;                                 \
     return r;                                         \
 }
 
 /* error without displaying */
-#define EXM_THROW_ND(errorNum, retType, ...)  {       \
+#define RETURN_QUIET_ERROR(errorNum, retType, ...)  { \
     retType r;                                        \
     memset(&r, 0, sizeof(retType));                   \
     DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);    \
     DEBUGOUTPUT("Error %i : ", errorNum);             \
     DEBUGOUTPUT(__VA_ARGS__);                         \
     DEBUGOUTPUT(" \n");                               \
-    r.error = errorNum;                               \
+    r.tag = errorNum;                                 \
     return r;                                         \
 }
 
@@ -124,16 +129,15 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
 *  Benchmark Parameters
 ***************************************/
 
-BMK_advancedParams_t BMK_initAdvancedParams(void) { 
-    BMK_advancedParams_t res = { 
+BMK_advancedParams_t BMK_initAdvancedParams(void) {
+    BMK_advancedParams_t const res = {
         BMK_both, /* mode */
-        BMK_timeMode, /* loopMode */
         BMK_TIMETEST_DEFAULT_S, /* nbSeconds */
         0, /* blockSize */
         0, /* nbWorkers */
         0, /* realTime */
         0, /* additionalParam */
-        0, /* ldmFlag */ 
+        0, /* ldmFlag */
         0, /* ldmMinMatch */
         0, /* ldmHashLog */
         0, /* ldmBuckSizeLog */
@@ -156,20 +160,13 @@ typedef struct {
     size_t resSize;
 } blockParam_t;
 
-struct  BMK_timeState_t{
-    unsigned nbLoops;
-    U64 timeRemaining;
-    UTIL_time_t coolTime;
-    U64 fastestTime;
-};
-
 #undef MIN
 #undef MAX
 #define MIN(a,b)    ((a) < (b) ? (a) : (b))
 #define MAX(a,b)    ((a) > (b) ? (a) : (b))
 
-static void BMK_initCCtx(ZSTD_CCtx* ctx, 
-    const void* dictBuffer, size_t dictBufferSize, int cLevel, 
+static void BMK_initCCtx(ZSTD_CCtx* ctx,
+    const void* dictBuffer, size_t dictBufferSize, int cLevel,
     const ZSTD_compressionParameters* comprParams, const BMK_advancedParams_t* adv) {
     ZSTD_CCtx_reset(ctx);
     ZSTD_CCtx_resetParameters(ctx);
@@ -194,15 +191,15 @@ static void BMK_initCCtx(ZSTD_CCtx* ctx,
     ZSTD_CCtx_loadDictionary(ctx, dictBuffer, dictBufferSize);
 }
 
-
-static void BMK_initDCtx(ZSTD_DCtx* dctx, 
+static void BMK_initDCtx(ZSTD_DCtx* dctx,
     const void* dictBuffer, size_t dictBufferSize) {
     ZSTD_DCtx_reset(dctx);
     ZSTD_DCtx_loadDictionary(dctx, dictBuffer, dictBufferSize);
 }
 
+
 typedef struct {
-    ZSTD_CCtx* ctx;
+    ZSTD_CCtx* cctx;
     const void* dictBuffer;
     size_t dictBufferSize;
     int cLevel;
@@ -212,7 +209,7 @@ typedef struct {
 
 static size_t local_initCCtx(void* payload) {
     BMK_initCCtxArgs* ag = (BMK_initCCtxArgs*)payload;
-    BMK_initCCtx(ag->ctx, ag->dictBuffer, ag->dictBufferSize, ag->cLevel, ag->comprParams, ag->adv);
+    BMK_initCCtx(ag->cctx, ag->dictBuffer, ag->dictBufferSize, ag->cLevel, ag->comprParams, ag->adv);
     return 0;
 }
 
@@ -228,26 +225,24 @@ static size_t local_initDCtx(void* payload) {
     return 0;
 }
 
-/* additional argument is just the context */
+
+/* `addArgs` is the context */
 static size_t local_defaultCompress(
-    const void* srcBuffer, size_t srcSize, 
-    void* dstBuffer, size_t dstSize, 
-    void* addArgs) {
+                    const void* srcBuffer, size_t srcSize,
+                    void* dstBuffer, size_t dstSize,
+                    void* addArgs)
+{
     size_t moreToFlush = 1;
-    ZSTD_CCtx* ctx = (ZSTD_CCtx*)addArgs;
+    ZSTD_CCtx* const cctx = (ZSTD_CCtx*)addArgs;
     ZSTD_inBuffer in;
     ZSTD_outBuffer out;
-    in.src = srcBuffer;
-    in.size = srcSize;
-    in.pos = 0;
-    out.dst = dstBuffer;
-    out.size = dstSize;
-    out.pos = 0;
+    in.src = srcBuffer; in.size = srcSize; in.pos = 0;
+    out.dst = dstBuffer; out.size = dstSize; out.pos = 0;
     while (moreToFlush) {
         if(out.pos == out.size) {
             return (size_t)-ZSTD_error_dstSize_tooSmall;
         }
-        moreToFlush = ZSTD_compress_generic(ctx, &out, &in, ZSTD_e_end);
+        moreToFlush = ZSTD_compress_generic(cctx, &out, &in, ZSTD_e_end);
         if (ZSTD_isError(moreToFlush)) {
             return moreToFlush;
         }
@@ -255,27 +250,23 @@ static size_t local_defaultCompress(
     return out.pos;
 }
 
-/* additional argument is just the context */
+/* `addArgs` is the context */
 static size_t local_defaultDecompress(
-    const void* srcBuffer, size_t srcSize, 
-    void* dstBuffer, size_t dstSize, 
-    void* addArgs) {
+                    const void* srcBuffer, size_t srcSize,
+                    void* dstBuffer, size_t dstCapacity,
+                    void* addArgs)
+{
     size_t moreToFlush = 1;
-    ZSTD_DCtx* dctx = (ZSTD_DCtx*)addArgs;
+    ZSTD_DCtx* const dctx = (ZSTD_DCtx*)addArgs;
     ZSTD_inBuffer in;
     ZSTD_outBuffer out;
-    in.src = srcBuffer;
-    in.size = srcSize;
-    in.pos = 0;
-    out.dst = dstBuffer;
-    out.size = dstSize;
-    out.pos = 0;
+    in.src = srcBuffer; in.size = srcSize; in.pos = 0;
+    out.dst = dstBuffer; out.size = dstCapacity; out.pos = 0;
     while (moreToFlush) {
         if(out.pos == out.size) {
             return (size_t)-ZSTD_error_dstSize_tooSmall;
         }
-        moreToFlush = ZSTD_decompress_generic(dctx,
-                            &out, &in);
+        moreToFlush = ZSTD_decompress_generic(dctx, &out, &in);
         if (ZSTD_isError(moreToFlush)) {
             return moreToFlush;
         }
@@ -284,198 +275,307 @@ static size_t local_defaultDecompress(
 
 }
 
-/* initFn will be measured once, bench fn will be measured x times */
-/* benchFn should return error value or out Size */
-/* takes # of blocks and list of size & stuff for each. */
-/* only does looping */
-/* note time per loop could be zero if interval too short */
-BMK_customReturn_t BMK_benchFunction(
-    BMK_benchFn_t benchFn, void* benchPayload,
-    BMK_initFn_t initFn, void* initPayload,
-    size_t blockCount,
-    const void* const * const srcBlockBuffers, const size_t* srcBlockSizes,
-    void* const * const dstBlockBuffers, const size_t* dstBlockCapacities, size_t* blockResult, 
-    unsigned nbLoops) {
-    size_t dstSize = 0;
-    U64 totalTime;
 
-    BMK_customReturn_t retval;
-    UTIL_time_t clockStart;
+/*===  Benchmarking an arbitrary function  ===*/
+
+int BMK_isSuccessful_runOutcome(BMK_runOutcome_t outcome)
+{
+    return outcome.tag == 0;
+}
+
+/* warning : this function will stop program execution if outcome is invalid !
+ *           check outcome validity first, using BMK_isValid_runResult() */
+BMK_runTime_t BMK_extract_runTime(BMK_runOutcome_t outcome)
+{
+    assert(outcome.tag == 0);
+    return outcome.internal_never_use_directly;
+}
+
+static BMK_runOutcome_t BMK_runOutcome_error(void)
+{
+    BMK_runOutcome_t b;
+    memset(&b, 0, sizeof(b));
+    b.tag = 1;
+    return b;
+}
+
+static BMK_runOutcome_t BMK_setValid_runTime(BMK_runTime_t runTime)
+{
+    BMK_runOutcome_t outcome;
+    outcome.tag = 0;
+    outcome.internal_never_use_directly = runTime;
+    return outcome;
+}
+
+
+/* initFn will be measured once, benchFn will be measured `nbLoops` times */
+/* initFn is optional, provide NULL if none */
+/* benchFn must return size_t field compliant with ZSTD_isError for error valuee */
+/* takes # of blocks and list of size & stuff for each. */
+/* can report result of benchFn for each block into blockResult. */
+/* blockResult is optional, provide NULL if this information is not required */
+/* note : time per loop could be zero if run time < timer resolution */
+BMK_runOutcome_t BMK_benchFunction(
+            BMK_benchFn_t benchFn, void* benchPayload,
+            BMK_initFn_t initFn, void* initPayload,
+            size_t blockCount,
+            const void* const * srcBlockBuffers, const size_t* srcBlockSizes,
+            void* const * dstBlockBuffers, const size_t* dstBlockCapacities,
+            size_t* blockResults,
+            unsigned nbLoops)
+{
+    size_t dstSize = 0;
 
     if(!nbLoops) {
-        EXM_THROW_ND(1, BMK_customReturn_t, "nbLoops must be nonzero \n");
+        RETURN_QUIET_ERROR(2, BMK_runOutcome_t, "nbLoops must be nonzero ");
     }
 
-    {
-        size_t i;
+    /* init */
+    {   size_t i;
         for(i = 0; i < blockCount; i++) {
             memset(dstBlockBuffers[i], 0xE5, dstBlockCapacities[i]);  /* warm up and erase result buffer */
         }
 #if 0
-        /* based on testing these seem to lower accuracy of multiple calls of 1 nbLoops vs 1 call of multiple nbLoops 
-         * (Makes former slower)   
+        /* based on testing these seem to lower accuracy of multiple calls of 1 nbLoops vs 1 call of multiple nbLoops
+         * (Makes former slower)
          */
         UTIL_sleepMilli(5);  /* give processor time to other processes */
         UTIL_waitForNextTick();
 #endif
     }
 
-    {      
-        unsigned i, j;
-        clockStart = UTIL_getTime();
-        if(initFn != NULL) { initFn(initPayload); }
-        for(i = 0; i < nbLoops; i++) {
-            for(j = 0; j < blockCount; j++) {
-                size_t res = benchFn(srcBlockBuffers[j], srcBlockSizes[j], dstBlockBuffers[j], dstBlockCapacities[j], benchPayload);
+    /* benchmark */
+    {   UTIL_time_t const clockStart = UTIL_getTime();
+        unsigned loopNb, blockNb;
+        if (initFn != NULL) initFn(initPayload);
+        for (loopNb = 0; loopNb < nbLoops; loopNb++) {
+            for (blockNb = 0; blockNb < blockCount; blockNb++) {
+                size_t const res = benchFn(srcBlockBuffers[blockNb], srcBlockSizes[blockNb],
+                                    dstBlockBuffers[blockNb], dstBlockCapacities[blockNb],
+                                    benchPayload);
                 if(ZSTD_isError(res)) {
-                    EXM_THROW_ND(2, BMK_customReturn_t, "Function benchmarking failed on block %u of size %u : %s  \n",
-                        j, (U32)dstBlockCapacities[j], ZSTD_getErrorName(res));
-                } else if(i == nbLoops - 1) {
+                    RETURN_QUIET_ERROR(2, BMK_runOutcome_t,
+                        "Function benchmark failed on block %u of size %u : %s",
+                        blockNb, (U32)dstBlockCapacities[blockNb], ZSTD_getErrorName(res));
+                } else if (loopNb == 0) {
                     dstSize += res;
-                    if(blockResult != NULL) {
-                        blockResult[j] = res;
-                    }
-                } 
-            }
-        }
-        totalTime = UTIL_clockSpanNano(clockStart);
-    }
+                    if (blockResults != NULL) blockResults[blockNb] = res;
+            }   }
+        }  /* for (loopNb = 0; loopNb < nbLoops; loopNb++) */
 
-    retval.error = 0;
-    retval.result.nanoSecPerRun = totalTime / nbLoops;
-    retval.result.sumOfReturn = dstSize;
-    return retval;
-} 
-
-#define MINUSABLETIME 500000000ULL /* 0.5 seconds in ns */
-
-void BMK_resetTimeState(BMK_timedFnState_t* r, unsigned nbSeconds) {
-    r->nbLoops = 1;
-    r->timeRemaining = (U64)nbSeconds * TIMELOOP_NANOSEC;
-    r->coolTime = UTIL_getTime();
-    r->fastestTime = (U64)(-1LL);
+        {   U64 const totalTime = UTIL_clockSpanNano(clockStart);
+            BMK_runTime_t rt;
+            rt.nanoSecPerRun = totalTime / nbLoops;
+            rt.sumOfReturn = dstSize;
+            return BMK_setValid_runTime(rt);
+    }   }
 }
 
-BMK_timedFnState_t* BMK_createTimeState(unsigned nbSeconds) {
-    BMK_timedFnState_t* r = (BMK_timedFnState_t*)malloc(sizeof(struct BMK_timeState_t));
-    if(r == NULL) {
-        return r;
-    }
-    BMK_resetTimeState(r, nbSeconds);
+
+/* ====  Benchmarking any function, providing intermediate results  ==== */
+
+struct BMK_timedFnState_s {
+    U64 timeSpent_ns;
+    U64 timeBudget_ns;
+    U64 runBudget_ns;
+    BMK_runTime_t fastestRun;
+    unsigned nbLoops;
+    UTIL_time_t coolTime;
+};  /* typedef'd to BMK_timedFnState_t within bench.h */
+
+BMK_timedFnState_t* BMK_createTimedFnState(unsigned total_ms, unsigned run_ms)
+{
+    BMK_timedFnState_t* const r = (BMK_timedFnState_t*)malloc(sizeof(*r));
+    if (r == NULL) return NULL;   /* malloc() error */
+    BMK_resetTimedFnState(r, total_ms, run_ms);
     return r;
 }
 
-void BMK_freeTimeState(BMK_timedFnState_t* state) {
+void BMK_freeTimedFnState(BMK_timedFnState_t* state) {
     free(state);
 }
 
-/* make option for dstBlocks to be */
-BMK_customTimedReturn_t BMK_benchFunctionTimed(
-    BMK_timedFnState_t* cont,
-    BMK_benchFn_t benchFn, void* benchPayload,
-    BMK_initFn_t initFn, void* initPayload,
-    size_t blockCount,
-    const void* const* const srcBlockBuffers, const size_t* srcBlockSizes,
-    void * const * const dstBlockBuffers, const size_t * dstBlockCapacities, size_t* blockResults) 
+void BMK_resetTimedFnState(BMK_timedFnState_t* timedFnState, unsigned total_ms, unsigned run_ms)
 {
-    U64 fastest = cont->fastestTime;
-    int completed = 0;
-    BMK_customTimedReturn_t r;
-    r.completed = 0;
+    if (!total_ms) total_ms = 1 ;
+    if (!run_ms) run_ms = 1;
+    if (run_ms > total_ms) run_ms = total_ms;
+    timedFnState->timeSpent_ns = 0;
+    timedFnState->timeBudget_ns = (U64)total_ms * TIMELOOP_NANOSEC / 1000;
+    timedFnState->runBudget_ns = (U64)run_ms * TIMELOOP_NANOSEC / 1000;
+    timedFnState->fastestRun.nanoSecPerRun = (U64)(-1LL);
+    timedFnState->fastestRun.sumOfReturn = (size_t)(-1LL);
+    timedFnState->nbLoops = 1;
+    timedFnState->coolTime = UTIL_getTime();
+}
+
+/* Tells if nb of seconds set in timedFnState for all runs is spent.
+ * note : this function will return 1 if BMK_benchFunctionTimed() has actually errored. */
+int BMK_isCompleted_TimedFn(const BMK_timedFnState_t* timedFnState)
+{
+    return (timedFnState->timeSpent_ns >= timedFnState->timeBudget_ns);
+}
+
+
+#define MINUSABLETIME  (TIMELOOP_NANOSEC / 2)  /* 0.5 seconds */
+
+BMK_runOutcome_t BMK_benchTimedFn(
+            BMK_timedFnState_t* cont,
+            BMK_benchFn_t benchFn, void* benchPayload,
+            BMK_initFn_t initFn, void* initPayload,
+            size_t blockCount,
+            const void* const* srcBlockBuffers, const size_t* srcBlockSizes,
+            void * const * dstBlockBuffers, const size_t * dstBlockCapacities,
+            size_t* blockResults)
+{
+    U64 const runBudget_ns = cont->runBudget_ns;
+    U64 const runTimeMin_ns = runBudget_ns / 2;
+    int completed = 0;
+    BMK_runTime_t bestRunTime = cont->fastestRun;
+
+    while (!completed) {
+        BMK_runOutcome_t runResult;
 
-    while(!r.completed && !completed)
-    {
         /* Overheat protection */
         if (UTIL_clockSpanMicro(cont->coolTime) > ACTIVEPERIOD_MICROSEC) {
             DEBUGOUTPUT("\rcooling down ...    \r");
             UTIL_sleep(COOLPERIOD_SEC);
             cont->coolTime = UTIL_getTime();
         }
+
         /* reinitialize capacity */
-        r.result = BMK_benchFunction(benchFn, benchPayload, initFn, initPayload,
-        blockCount, srcBlockBuffers, srcBlockSizes, dstBlockBuffers, dstBlockCapacities, blockResults, cont->nbLoops);
-        if(r.result.error) { /* completed w/ error */
-            r.completed = 1;
-            return r;
+        runResult = BMK_benchFunction(benchFn, benchPayload,
+                                    initFn, initPayload,
+                                    blockCount,
+                                    srcBlockBuffers, srcBlockSizes,
+                                    dstBlockBuffers, dstBlockCapacities,
+                                    blockResults,
+                                    cont->nbLoops);
+
+        if(!BMK_isSuccessful_runOutcome(runResult)) { /* error : move out */
+            return BMK_runOutcome_error();
         }
 
-        {   U64 const loopDuration = r.result.result.nanoSecPerRun * cont->nbLoops;
-            r.completed = (cont->timeRemaining <= loopDuration);
-            cont->timeRemaining -= loopDuration;
-            if (loopDuration > (TIMELOOP_NANOSEC / 100)) { 
-                fastest = MIN(fastest, r.result.result.nanoSecPerRun);
-                if(loopDuration >= MINUSABLETIME) {
-                    r.result.result.nanoSecPerRun = fastest;
-                    cont->fastestTime = fastest;
-                }
-                cont->nbLoops = (U32)(TIMELOOP_NANOSEC / r.result.result.nanoSecPerRun) + 1;
+        {   BMK_runTime_t const newRunTime = BMK_extract_runTime(runResult);
+            U64 const loopDuration_ns = newRunTime.nanoSecPerRun * cont->nbLoops;
+
+            cont->timeSpent_ns += loopDuration_ns;
+
+            /* estimate nbLoops for next run to last approximately 1 second */
+            if (loopDuration_ns > (runBudget_ns / 50)) {
+                U64 const fastestRun_ns = MIN(bestRunTime.nanoSecPerRun, newRunTime.nanoSecPerRun);
+                cont->nbLoops = (U32)(runBudget_ns / fastestRun_ns) + 1;
             } else {
-                const unsigned multiplier = 2;
+                /* previous run was too short : blindly increase workload by x multiplier */
+                const unsigned multiplier = 10;
                 assert(cont->nbLoops < ((unsigned)-1) / multiplier);  /* avoid overflow */
                 cont->nbLoops *= multiplier;
             }
-            if(loopDuration < MINUSABLETIME) { /* don't report results which have time too low */
-                continue;
-            }
 
+            if(loopDuration_ns < runTimeMin_ns) {
+                /* don't report results for which benchmark run time was too small : increased risks of rounding errors */
+                assert(completed == 0);
+                continue;
+            } else {
+                if(newRunTime.nanoSecPerRun < bestRunTime.nanoSecPerRun) {
+                    bestRunTime = newRunTime;
+                }
+                completed = 1;
+            }
         }
-        completed = 1;
-    }
-    return r;
+    }   /* while (!completed) */
+
+    return BMK_setValid_runTime(bestRunTime);
 }
 
-/* benchMem with no allocation */
-static BMK_return_t BMK_benchMemAdvancedNoAlloc(
-    const void ** const srcPtrs, size_t* const srcSizes,
-    void** const cPtrs, size_t* const cCapacities, size_t* const cSizes, 
-    void** const resPtrs, size_t* const resSizes,
-    void** resultBufferPtr, void* compressedBuffer,
-    const size_t maxCompressedSize,
-    BMK_timedFnState_t* timeStateCompress, BMK_timedFnState_t* timeStateDecompress,
 
-    const void* srcBuffer, size_t srcSize,
-    const size_t* fileSizes, unsigned nbFiles,
-    const int cLevel, const ZSTD_compressionParameters* comprParams,
-    const void* dictBuffer, size_t dictBufferSize,
-    ZSTD_CCtx* ctx, ZSTD_DCtx* dctx,
-    int displayLevel, const char* displayName, const BMK_advancedParams_t* adv) 
+/* ================================================================= */
+/*      Benchmark Zstandard, mem-to-mem scenarios                    */
+/* ================================================================= */
+
+int BMK_isSuccessful_benchOutcome(BMK_benchOutcome_t outcome)
 {
-    size_t const blockSize = ((adv->blockSize>=32 && (adv->mode != BMK_decodeOnly)) ? adv->blockSize : srcSize) + (!srcSize); /* avoid div by 0 */
-    BMK_return_t results = { { 0, 0, 0, 0 }, 0 } ;
+    return outcome.tag == 0;
+}
+
+BMK_benchResult_t BMK_extract_benchResult(BMK_benchOutcome_t outcome)
+{
+    assert(outcome.tag == 0);
+    return outcome.internal_never_use_directly;
+}
+
+static BMK_benchOutcome_t BMK_benchOutcome_error(void)
+{
+    BMK_benchOutcome_t b;
+    memset(&b, 0, sizeof(b));
+    b.tag = 1;
+    return b;
+}
+
+static BMK_benchOutcome_t BMK_benchOutcome_setValidResult(BMK_benchResult_t result)
+{
+    BMK_benchOutcome_t b;
+    b.tag = 0;
+    b.internal_never_use_directly = result;
+    return b;
+}
+
+
+/* benchMem with no allocation */
+static BMK_benchOutcome_t BMK_benchMemAdvancedNoAlloc(
+            const void** srcPtrs, size_t* srcSizes,
+            void** cPtrs, size_t* cCapacities, size_t* cSizes,
+            void** resPtrs, size_t* resSizes,
+            void** resultBufferPtr, void* compressedBuffer,
+            size_t maxCompressedSize,
+            BMK_timedFnState_t* timeStateCompress,
+            BMK_timedFnState_t* timeStateDecompress,
+
+            const void* srcBuffer, size_t srcSize,
+            const size_t* fileSizes, unsigned nbFiles,
+            const int cLevel, const ZSTD_compressionParameters* comprParams,
+            const void* dictBuffer, size_t dictBufferSize,
+            ZSTD_CCtx* cctx, ZSTD_DCtx* dctx,
+            int displayLevel, const char* displayName,
+            const BMK_advancedParams_t* adv)
+{
+    size_t const blockSize = ((adv->blockSize>=32 && (adv->mode != BMK_decodeOnly)) ? adv->blockSize : srcSize) + (!srcSize);  /* avoid div by 0 */
+    BMK_benchResult_t benchResult;
     size_t const loadedCompressedSize = srcSize;
     size_t cSize = 0;
     double ratio = 0.;
     U32 nbBlocks;
 
-    if(!ctx || !dctx) 
-        EXM_THROW(31, BMK_return_t, "error: passed in null context");
+    assert(cctx != NULL); assert(dctx != NULL);
 
     /* init */
-    if (strlen(displayName)>17) displayName += strlen(displayName)-17;   /* display last 17 characters */
+    memset(&benchResult, 0, sizeof(benchResult));
+    if (strlen(displayName)>17) displayName += strlen(displayName) - 17;   /* display last 17 characters */
     if (adv->mode == BMK_decodeOnly) {  /* benchmark only decompression : source must be already compressed */
         const char* srcPtr = (const char*)srcBuffer;
         U64 totalDSize64 = 0;
         U32 fileNb;
         for (fileNb=0; fileNb<nbFiles; fileNb++) {
             U64 const fSize64 = ZSTD_findDecompressedSize(srcPtr, fileSizes[fileNb]);
-            if (fSize64==0) EXM_THROW(32, BMK_return_t, "Impossible to determine original size ");
+            if (fSize64==0) RETURN_ERROR(32, BMK_benchOutcome_t, "Impossible to determine original size ");
             totalDSize64 += fSize64;
             srcPtr += fileSizes[fileNb];
         }
         {   size_t const decodedSize = (size_t)totalDSize64;
+            assert((U64)decodedSize == totalDSize64);   /* check overflow */
             free(*resultBufferPtr);
             *resultBufferPtr = malloc(decodedSize);
             if (!(*resultBufferPtr)) {
-                EXM_THROW(33, BMK_return_t, "not enough memory"); 
+                RETURN_ERROR(33, BMK_benchOutcome_t, "not enough memory");
             }
-            if (totalDSize64 > decodedSize) {
-                free(*resultBufferPtr); 
-                EXM_THROW(32, BMK_return_t, "original size is too large");   /* size_t overflow */
+            if (totalDSize64 > decodedSize) {  /* size_t overflow */
+                free(*resultBufferPtr);
+                RETURN_ERROR(32, BMK_benchOutcome_t, "original size is too large");
             }
             cSize = srcSize;
             srcSize = decodedSize;
             ratio = (double)srcSize / (double)cSize;
-        }   
+        }
     }
 
     /* Init data blocks  */
@@ -489,21 +589,21 @@ static BMK_return_t BMK_benchMemAdvancedNoAlloc(
             U32 const blockEnd = nbBlocks + nbBlocksforThisFile;
             for ( ; nbBlocks<blockEnd; nbBlocks++) {
                 size_t const thisBlockSize = MIN(remaining, blockSize);
-                srcPtrs[nbBlocks] = (const void*)srcPtr;
+                srcPtrs[nbBlocks] = srcPtr;
                 srcSizes[nbBlocks] = thisBlockSize;
-                cPtrs[nbBlocks] = (void*)cPtr;
+                cPtrs[nbBlocks] = cPtr;
                 cCapacities[nbBlocks] = (adv->mode == BMK_decodeOnly) ? thisBlockSize : ZSTD_compressBound(thisBlockSize);
-                resPtrs[nbBlocks] = (void*)resPtr;
+                resPtrs[nbBlocks] = resPtr;
                 resSizes[nbBlocks] = (adv->mode == BMK_decodeOnly) ? (size_t) ZSTD_findDecompressedSize(srcPtr, thisBlockSize) : thisBlockSize;
                 srcPtr += thisBlockSize;
                 cPtr += cCapacities[nbBlocks];
                 resPtr += thisBlockSize;
                 remaining -= thisBlockSize;
-            }   
-        }   
+            }
+        }
     }
 
-    /* warmimg up memory */
+    /* warmimg up `compressedBuffer` */
     if (adv->mode == BMK_decodeOnly) {
         memcpy(compressedBuffer, srcBuffer, loadedCompressedSize);
     } else {
@@ -511,151 +611,105 @@ static BMK_return_t BMK_benchMemAdvancedNoAlloc(
     }
 
     /* Bench */
-    {   
-        U64 const crcOrig = (adv->mode == BMK_decodeOnly) ? 0 : XXH64(srcBuffer, srcSize, 0);
+    {   U64 const crcOrig = (adv->mode == BMK_decodeOnly) ? 0 : XXH64(srcBuffer, srcSize, 0);
 #       define NB_MARKS 4
-        const char* const marks[NB_MARKS] = { " |", " /", " =",  "\\" };
+        const char* marks[NB_MARKS] = { " |", " /", " =", " \\" };
         U32 markNb = 0;
-        DISPLAYLEVEL(2, "\r%79s\r", "");
+        int compressionCompleted = (adv->mode == BMK_decodeOnly);
+        int decompressionCompleted = (adv->mode == BMK_compressOnly);
+        BMK_initCCtxArgs cctxprep;
+        BMK_initDCtxArgs dctxprep;
+        cctxprep.cctx = cctx;
+        cctxprep.dictBuffer = dictBuffer;
+        cctxprep.dictBufferSize = dictBufferSize;
+        cctxprep.cLevel = cLevel;
+        cctxprep.comprParams = comprParams;
+        cctxprep.adv = adv;
+        dctxprep.dctx = dctx;
+        dctxprep.dictBuffer = dictBuffer;
+        dctxprep.dictBufferSize = dictBufferSize;
 
+        DISPLAYLEVEL(2, "\r%70s\r", "");   /* blank line */
         DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->\r", marks[markNb], displayName, (U32)srcSize);
-        {
-            BMK_initCCtxArgs cctxprep;
-            BMK_initDCtxArgs dctxprep;
-            cctxprep.ctx = ctx;
-            cctxprep.dictBuffer = dictBuffer;
-            cctxprep.dictBufferSize = dictBufferSize;
-            cctxprep.cLevel = cLevel;
-            cctxprep.comprParams = comprParams;
-            cctxprep.adv = adv;
-            dctxprep.dctx = dctx;
-            dctxprep.dictBuffer = dictBuffer;
-            dctxprep.dictBufferSize = dictBufferSize;
-            if(adv->loopMode == BMK_timeMode) {
-                BMK_customTimedReturn_t intermediateResultCompress;
-                BMK_customTimedReturn_t intermediateResultDecompress;
-                if(adv->mode == BMK_compressOnly) {
-                    intermediateResultCompress.completed = 0;
-                    intermediateResultDecompress.completed = 1;
-                } else if (adv->mode == BMK_decodeOnly) {
-                    intermediateResultCompress.completed = 1;
-                    intermediateResultDecompress.completed = 0;
-                } else { /* both */
-                    intermediateResultCompress.completed = 0;
-                    intermediateResultDecompress.completed = 0;
-                }
-                while(!(intermediateResultCompress.completed && intermediateResultDecompress.completed)) {
-                    if(!intermediateResultCompress.completed) { 
-                        intermediateResultCompress = BMK_benchFunctionTimed(timeStateCompress, &local_defaultCompress, (void*)ctx, &local_initCCtx, (void*)&cctxprep,
-                        nbBlocks, srcPtrs, srcSizes, cPtrs, cCapacities, cSizes);
-                        if(intermediateResultCompress.result.error) {
-                            results.error = intermediateResultCompress.result.error;
-                            return results;
-                        }
-                        ratio = (double)(srcSize / intermediateResultCompress.result.result.sumOfReturn);
-                        {   
-                            int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
-                            results.result.cSpeed = (srcSize * TIMELOOP_NANOSEC / intermediateResultCompress.result.result.nanoSecPerRun);
-                            cSize = intermediateResultCompress.result.result.sumOfReturn;
-                            results.result.cSize = cSize;
-                            ratio = (double)srcSize / results.result.cSize;
-                            markNb = (markNb+1) % NB_MARKS;
-                            DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s\r",
-                                    marks[markNb], displayName, (U32)srcSize, (U32)results.result.cSize,
-                                    ratioAccuracy, ratio,
-                                    results.result.cSpeed < (10 MB) ? 2 : 1, (double)results.result.cSpeed / (1 MB));
-                        }
-                    }
 
-                    if(!intermediateResultDecompress.completed) {
-                        intermediateResultDecompress = BMK_benchFunctionTimed(timeStateDecompress, &local_defaultDecompress, (void*)(dctx), &local_initDCtx, (void*)&dctxprep,
-                        nbBlocks, (const void* const*)cPtrs, cSizes, resPtrs, resSizes, NULL);
-                        if(intermediateResultDecompress.result.error) {
-                            results.error = intermediateResultDecompress.result.error;
-                            return results;
-                        }
-      
-                        {   
-                            int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
-                            results.result.dSpeed = (srcSize * TIMELOOP_NANOSEC/ intermediateResultDecompress.result.result.nanoSecPerRun);
-                            markNb = (markNb+1) % NB_MARKS;
-                            DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s ,%6.1f MB/s \r",
-                                    marks[markNb], displayName, (U32)srcSize, (U32)results.result.cSize,
-                                    ratioAccuracy, ratio,
-                                    results.result.cSpeed < (10 MB) ? 2 : 1, (double)results.result.cSpeed / (1 MB),
-                                    (double)results.result.dSpeed / (1 MB));
-                        }
-                    }
+        while (!(compressionCompleted && decompressionCompleted)) {
+
+            if (!compressionCompleted) {
+                BMK_runOutcome_t const cOutcome =
+                        BMK_benchTimedFn( timeStateCompress,
+                                        &local_defaultCompress, cctx,
+                                        &local_initCCtx, &cctxprep,
+                                        nbBlocks,
+                                        srcPtrs, srcSizes,
+                                        cPtrs, cCapacities,
+                                        cSizes);
+
+                if (!BMK_isSuccessful_runOutcome(cOutcome)) {
+                    return BMK_benchOutcome_error();
                 }
 
-            } else { //iterMode;
-                if(adv->mode != BMK_decodeOnly) {
+                {   BMK_runTime_t const cResult = BMK_extract_runTime(cOutcome);
+                    cSize = cResult.sumOfReturn;
+                    ratio = (double)srcSize / cSize;
+                    {   BMK_benchResult_t newResult;
+                        newResult.cSpeed = ((U64)srcSize * TIMELOOP_NANOSEC / cResult.nanoSecPerRun);
+                        benchResult.cSize = cSize;
+                        if (newResult.cSpeed > benchResult.cSpeed)
+                            benchResult.cSpeed = newResult.cSpeed;
+                }   }
 
-                    BMK_customReturn_t compressionResults = BMK_benchFunction(&local_defaultCompress, (void*)ctx, &local_initCCtx, (void*)&cctxprep,
-                        nbBlocks, srcPtrs, srcSizes, cPtrs, cCapacities, cSizes, adv->nbSeconds); 
-                    if(compressionResults.error) {
-                        results.error = compressionResults.error;
-                        return results;
-                    }
-
-                    if(compressionResults.result.nanoSecPerRun == 0) {
-                        results.result.cSpeed = 0;
-                    } else {
-                        results.result.cSpeed = srcSize * TIMELOOP_NANOSEC / compressionResults.result.nanoSecPerRun;
-                    }
-
-                    results.result.cSize = compressionResults.result.sumOfReturn;
-                    {   
-                        int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
-                        cSize = compressionResults.result.sumOfReturn;
-                        results.result.cSize = cSize;
-                        ratio = (double)srcSize / results.result.cSize;
-                        markNb = (markNb+1) % NB_MARKS;
-                        DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s\r",
-                                marks[markNb], displayName, (U32)srcSize, (U32)results.result.cSize,
-                                ratioAccuracy, ratio,
-                                results.result.cSpeed < (10 MB) ? 2 : 1, (double)results.result.cSpeed / (1 MB));
-                    }
-                }
-                if(adv->mode != BMK_compressOnly) {
-                    BMK_customReturn_t decompressionResults = BMK_benchFunction(
-                        &local_defaultDecompress, (void*)(dctx),
-                        &local_initDCtx, (void*)&dctxprep, nbBlocks,
-                        (const void* const*)cPtrs, cSizes, resPtrs, resSizes, NULL, 
-                        adv->nbSeconds);
-                    if(decompressionResults.error) {
-                        results.error = decompressionResults.error;
-                        return results;
-                    }
-
-                    if(decompressionResults.result.nanoSecPerRun == 0) {
-                        results.result.dSpeed = 0;
-                    } else {
-                        results.result.dSpeed = srcSize * TIMELOOP_NANOSEC / decompressionResults.result.nanoSecPerRun;
-                    }
-
-                    {   
-                        int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
-                        markNb = (markNb+1) % NB_MARKS;
-                        DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s ,%6.1f MB/s \r",
-                                marks[markNb], displayName, (U32)srcSize, (U32)results.result.cSize,
-                                ratioAccuracy, ratio,
-                                results.result.cSpeed < (10 MB) ? 2 : 1, (double)results.result.cSpeed / (1 MB),
-                                (double)results.result.dSpeed / (1 MB));
-                    }
+                {   int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
+                    markNb = (markNb+1) % NB_MARKS;
+                    DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s\r",
+                            marks[markNb], displayName,
+                            (U32)srcSize, (U32)cSize,
+                            ratioAccuracy, ratio,
+                            benchResult.cSpeed < (10 MB) ? 2 : 1, (double)benchResult.cSpeed / MB_UNIT);
                 }
+                compressionCompleted = BMK_isCompleted_TimedFn(timeStateCompress);
             }
-        }
+
+            if(!decompressionCompleted) {
+                BMK_runOutcome_t const dOutcome =
+                        BMK_benchTimedFn(timeStateDecompress,
+                                        &local_defaultDecompress, dctx,
+                                        &local_initDCtx, &dctxprep,
+                                        nbBlocks,
+                                        (const void *const *)cPtrs, cSizes,
+                                        resPtrs, resSizes,
+                                        NULL);
+
+                if(!BMK_isSuccessful_runOutcome(dOutcome)) {
+                    return BMK_benchOutcome_error();
+                }
+
+                {   BMK_runTime_t const dResult = BMK_extract_runTime(dOutcome);
+                    U64 const newDSpeed = (srcSize * TIMELOOP_NANOSEC / dResult.nanoSecPerRun);
+                    if (newDSpeed > benchResult.dSpeed)
+                        benchResult.dSpeed = newDSpeed;
+                }
+
+                {   int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
+                    markNb = (markNb+1) % NB_MARKS;
+                    DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s ,%6.1f MB/s \r",
+                            marks[markNb], displayName,
+                            (U32)srcSize, (U32)benchResult.cSize,
+                            ratioAccuracy, ratio,
+                            benchResult.cSpeed < (10 MB) ? 2 : 1, (double)benchResult.cSpeed / MB_UNIT,
+                            (double)benchResult.dSpeed / MB_UNIT);
+                }
+                decompressionCompleted = BMK_isCompleted_TimedFn(timeStateDecompress);
+            }
+        }   /* while (!(compressionCompleted && decompressionCompleted)) */
 
         /* CRC Checking */
-        {   void* resultBuffer = *resultBufferPtr;
+        {   const BYTE* resultBuffer = (const BYTE*)(*resultBufferPtr);
             U64 const crcCheck = XXH64(resultBuffer, srcSize, 0);
-            /* adv->mode == 0 -> compress + decompress */
             if ((adv->mode == BMK_both) && (crcOrig!=crcCheck)) {
                 size_t u;
                 DISPLAY("!!! WARNING !!! %14s : Invalid Checksum : %x != %x   \n", displayName, (unsigned)crcOrig, (unsigned)crcCheck);
                 for (u=0; u<srcSize; u++) {
-                    if (((const BYTE*)srcBuffer)[u] != ((const BYTE*)resultBuffer)[u]) {
+                    if (((const BYTE*)srcBuffer)[u] != resultBuffer[u]) {
                         U32 segNb, bNb, pos;
                         size_t bacc = 0;
                         DISPLAY("Decoding error at pos %u ", (U32)u);
@@ -674,44 +728,47 @@ static BMK_return_t BMK_benchMemAdvancedNoAlloc(
                             for (n=1; n<3; n++) DISPLAY("%02X ", ((const BYTE*)srcBuffer)[u+n]);
                             DISPLAY(" \n");
                             DISPLAY("decode: ");
-                            for (n=-5; n<0; n++) DISPLAY("%02X ", ((const BYTE*)resultBuffer)[u+n]);
-                            DISPLAY(" :%02X:  ", ((const BYTE*)resultBuffer)[u]);
-                            for (n=1; n<3; n++) DISPLAY("%02X ", ((const BYTE*)resultBuffer)[u+n]);
+                            for (n=-5; n<0; n++) DISPLAY("%02X ", resultBuffer[u+n]);
+                            DISPLAY(" :%02X:  ", resultBuffer[u]);
+                            for (n=1; n<3; n++) DISPLAY("%02X ", resultBuffer[u+n]);
                             DISPLAY(" \n");
                         }
                         break;
                     }
                     if (u==srcSize-1) {  /* should never happen */
                         DISPLAY("no difference detected\n");
-                    }   
+                    }
                 }
-            }   
+            }
         }   /* CRC Checking */
 
-    if (displayLevel == 1) {   /* hidden display mode -q, used by python speed benchmark */
-        double const cSpeed = (double)results.result.cSpeed / (1 MB);
-        double const dSpeed = (double)results.result.dSpeed / (1 MB);
-        if (adv->additionalParam) {
-            DISPLAY("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s (param=%d)\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName, adv->additionalParam);
-        } else {
-            DISPLAY("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName);
+        if (displayLevel == 1) {   /* hidden display mode -q, used by python speed benchmark */
+            double const cSpeed = (double)benchResult.cSpeed / MB_UNIT;
+            double const dSpeed = (double)benchResult.dSpeed / MB_UNIT;
+            if (adv->additionalParam) {
+                DISPLAY("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s (param=%d)\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName, adv->additionalParam);
+            } else {
+                DISPLAY("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName);
+            }
         }
-    }
-    DISPLAYLEVEL(2, "%2i#\n", cLevel);
+
+        DISPLAYLEVEL(2, "%2i#\n", cLevel);
     }   /* Bench */
-    results.result.cMem = (1ULL << (comprParams->windowLog)) + ZSTD_sizeof_CCtx(ctx);
-    results.error = 0;
-    return results;
+
+    benchResult.cMem = (1ULL << (comprParams->windowLog)) + ZSTD_sizeof_CCtx(cctx);
+    return BMK_benchOutcome_setValidResult(benchResult);
 }
 
-BMK_return_t BMK_benchMemAdvanced(const void* srcBuffer, size_t srcSize,
-                        void* dstBuffer, size_t dstCapacity, 
+BMK_benchOutcome_t BMK_benchMemAdvanced(const void* srcBuffer, size_t srcSize,
+                        void* dstBuffer, size_t dstCapacity,
                         const size_t* fileSizes, unsigned nbFiles,
-                        const int cLevel, const ZSTD_compressionParameters* comprParams,
+                        int cLevel, const ZSTD_compressionParameters* comprParams,
                         const void* dictBuffer, size_t dictBufferSize,
                         int displayLevel, const char* displayName, const BMK_advancedParams_t* adv)
 
 {
+    int const dstParamsError = !dstBuffer ^ !dstCapacity;  /* must be both NULL or none */
+
     size_t const blockSize = ((adv->blockSize>=32 && (adv->mode != BMK_decodeOnly)) ? adv->blockSize : srcSize) + (!srcSize) /* avoid div by 0 */ ;
     U32 const maxNbBlocks = (U32) ((srcSize + (blockSize-1)) / blockSize) + nbFiles;
 
@@ -727,71 +784,78 @@ BMK_return_t BMK_benchMemAdvanced(const void* srcBuffer, size_t srcSize,
     void ** const resPtrs = (void**)malloc(maxNbBlocks * sizeof(void*));
     size_t* const resSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
 
-    BMK_timedFnState_t* timeStateCompress = BMK_createTimeState(adv->nbSeconds);
-    BMK_timedFnState_t* timeStateDecompress = BMK_createTimeState(adv->nbSeconds);
+    BMK_timedFnState_t* timeStateCompress = BMK_createTimedFnState(adv->nbSeconds * 1000, BMK_RUNTEST_DEFAULT_MS);
+    BMK_timedFnState_t* timeStateDecompress = BMK_createTimedFnState(adv->nbSeconds * 1000, BMK_RUNTEST_DEFAULT_MS);
 
-    ZSTD_CCtx* ctx = ZSTD_createCCtx();
-    ZSTD_DCtx* dctx = ZSTD_createDCtx();
+    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
 
     const size_t maxCompressedSize = dstCapacity ? dstCapacity : ZSTD_compressBound(srcSize) + (maxNbBlocks * 1024);
 
     void* const internalDstBuffer = dstBuffer ? NULL : malloc(maxCompressedSize);
     void* const compressedBuffer = dstBuffer ? dstBuffer : internalDstBuffer;
 
-    BMK_return_t results = { { 0, 0, 0, 0 }, 0 };
-
-    int parametersConflict = !dstBuffer ^ !dstCapacity;
+    BMK_benchOutcome_t outcome = BMK_benchOutcome_error();  /* error by default */
 
     void* resultBuffer = srcSize ? malloc(srcSize) : NULL;
 
-    int allocationincomplete = !srcPtrs || !srcSizes || !cPtrs || 
-        !cSizes || !cCapacities || !resPtrs || !resSizes || 
-        !timeStateCompress || !timeStateDecompress || !compressedBuffer || !resultBuffer;
+    int allocationincomplete = !srcPtrs || !srcSizes || !cPtrs ||
+        !cSizes || !cCapacities || !resPtrs || !resSizes ||
+        !timeStateCompress || !timeStateDecompress ||
+        !cctx || !dctx ||
+        !compressedBuffer || !resultBuffer;
 
 
-
-    if (!allocationincomplete && !parametersConflict) {
-        results = BMK_benchMemAdvancedNoAlloc(srcPtrs, srcSizes, cPtrs, cCapacities, cSizes,
-            resPtrs, resSizes, &resultBuffer, compressedBuffer, maxCompressedSize, timeStateCompress, timeStateDecompress,
-            srcBuffer, srcSize, fileSizes, nbFiles, cLevel, comprParams,
-            dictBuffer, dictBufferSize, ctx, dctx, displayLevel, displayName, adv);
+    if (!allocationincomplete && !dstParamsError) {
+        outcome = BMK_benchMemAdvancedNoAlloc(srcPtrs, srcSizes,
+                                            cPtrs, cCapacities, cSizes,
+                                            resPtrs, resSizes,
+                                            &resultBuffer,
+                                            compressedBuffer, maxCompressedSize,
+                                            timeStateCompress, timeStateDecompress,
+                                            srcBuffer, srcSize,
+                                            fileSizes, nbFiles,
+                                            cLevel, comprParams,
+                                            dictBuffer, dictBufferSize,
+                                            cctx, dctx,
+                                            displayLevel, displayName, adv);
     }
-    
+
     /* clean up */
-    BMK_freeTimeState(timeStateCompress);
-    BMK_freeTimeState(timeStateDecompress);
-    
-    ZSTD_freeCCtx(ctx);
+    BMK_freeTimedFnState(timeStateCompress);
+    BMK_freeTimedFnState(timeStateDecompress);
+
+    ZSTD_freeCCtx(cctx);
     ZSTD_freeDCtx(dctx);
 
     free(internalDstBuffer);
     free(resultBuffer);
 
-    free((void*)srcPtrs); 
-    free(srcSizes); 
-    free(cPtrs); 
+    free((void*)srcPtrs);
+    free(srcSizes);
+    free(cPtrs);
     free(cSizes);
     free(cCapacities);
     free(resPtrs);
     free(resSizes);
 
     if(allocationincomplete) {
-        EXM_THROW(31, BMK_return_t, "allocation error : not enough memory");
+        RETURN_ERROR(31, BMK_benchOutcome_t, "allocation error : not enough memory");
     }
-    
-    if(parametersConflict) {
-        EXM_THROW(32, BMK_return_t, "Conflicting input results");   
+
+    if(dstParamsError) {
+        RETURN_ERROR(32, BMK_benchOutcome_t, "Dst parameters not coherent");
     }
-    return results;
+    return outcome;
 }
 
-BMK_return_t BMK_benchMem(const void* srcBuffer, size_t srcSize,
+BMK_benchOutcome_t BMK_benchMem(const void* srcBuffer, size_t srcSize,
                         const size_t* fileSizes, unsigned nbFiles,
-                        const int cLevel, const ZSTD_compressionParameters* comprParams,
+                        int cLevel, const ZSTD_compressionParameters* comprParams,
                         const void* dictBuffer, size_t dictBufferSize,
                         int displayLevel, const char* displayName) {
 
-    const BMK_advancedParams_t adv = BMK_initAdvancedParams();
+    BMK_advancedParams_t const adv = BMK_initAdvancedParams();
     return BMK_benchMemAdvanced(srcBuffer, srcSize,
                                 NULL, 0,
                                 fileSizes, nbFiles,
@@ -800,6 +864,71 @@ BMK_return_t BMK_benchMem(const void* srcBuffer, size_t srcSize,
                                 displayLevel, displayName, &adv);
 }
 
+static BMK_benchOutcome_t BMK_benchCLevel(const void* srcBuffer, size_t benchedSize,
+                            const size_t* fileSizes, unsigned nbFiles,
+                            int cLevel, const ZSTD_compressionParameters* comprParams,
+                            const void* dictBuffer, size_t dictBufferSize,
+                            int displayLevel, const char* displayName,
+                            BMK_advancedParams_t const * const adv)
+{
+    const char* pch = strrchr(displayName, '\\'); /* Windows */
+    if (!pch) pch = strrchr(displayName, '/');    /* Linux */
+    if (pch) displayName = pch+1;
+
+    if (adv->realTime) {
+        DISPLAYLEVEL(2, "Note : switching to real-time priority \n");
+        SET_REALTIME_PRIORITY;
+    }
+
+    if (displayLevel == 1 && !adv->additionalParam)   /* --quiet mode */
+        DISPLAY("bench %s %s: input %u bytes, %u seconds, %u KB blocks\n",
+                ZSTD_VERSION_STRING, ZSTD_GIT_COMMIT_STRING,
+                (U32)benchedSize, adv->nbSeconds, (U32)(adv->blockSize>>10));
+
+    return BMK_benchMemAdvanced(srcBuffer, benchedSize,
+                                NULL, 0,
+                                fileSizes, nbFiles,
+                                cLevel, comprParams,
+                                dictBuffer, dictBufferSize,
+                                displayLevel, displayName, adv);
+}
+
+BMK_benchOutcome_t BMK_syntheticTest(int cLevel, double compressibility,
+                          const ZSTD_compressionParameters* compressionParams,
+                          int displayLevel, const BMK_advancedParams_t* adv)
+{
+    char name[20] = {0};
+    size_t const benchedSize = 10000000;
+    void* srcBuffer;
+    BMK_benchOutcome_t res;
+
+    if (cLevel > ZSTD_maxCLevel()) {
+        RETURN_ERROR(15, BMK_benchOutcome_t, "Invalid Compression Level");
+    }
+
+    /* Memory allocation */
+    srcBuffer = malloc(benchedSize);
+    if (!srcBuffer) RETURN_ERROR(21, BMK_benchOutcome_t, "not enough memory");
+
+    /* Fill input buffer */
+    RDG_genBuffer(srcBuffer, benchedSize, compressibility, 0.0, 0);
+
+    /* Bench */
+    snprintf (name, sizeof(name), "Synthetic %2u%%", (unsigned)(compressibility*100));
+    res = BMK_benchCLevel(srcBuffer, benchedSize,
+                    &benchedSize /* ? */, 1 /* ? */,
+                    cLevel, compressionParams,
+                    NULL, 0,  /* dictionary */
+                    displayLevel, name, adv);
+
+    /* clean up */
+    free(srcBuffer);
+
+    return res;
+}
+
+
+
 static size_t BMK_findMaxMem(U64 requiredMem)
 {
     size_t const step = 64 MB;
@@ -818,45 +947,13 @@ static size_t BMK_findMaxMem(U64 requiredMem)
     return (size_t)(requiredMem);
 }
 
-static BMK_return_t BMK_benchCLevel(const void* srcBuffer, size_t benchedSize,
-                            const size_t* fileSizes, unsigned nbFiles,
-                            const int cLevel, const ZSTD_compressionParameters* comprParams,
-                            const void* dictBuffer, size_t dictBufferSize,
-                            int displayLevel, const char* displayName,
-                            BMK_advancedParams_t const * const adv)
-{
-    BMK_return_t res;
-
-    const char* pch = strrchr(displayName, '\\'); /* Windows */
-
-    if (!pch) pch = strrchr(displayName, '/'); /* Linux */
-    if (pch) displayName = pch+1;
-
-    if (adv->realTime) {
-        DISPLAYLEVEL(2, "Note : switching to real-time priority \n");
-        SET_REALTIME_PRIORITY;
-    }
-
-    if (displayLevel == 1 && !adv->additionalParam)
-        DISPLAY("bench %s %s: input %u bytes, %u seconds, %u KB blocks\n", ZSTD_VERSION_STRING, ZSTD_GIT_COMMIT_STRING, (U32)benchedSize, adv->nbSeconds, (U32)(adv->blockSize>>10));
-
-    res = BMK_benchMemAdvanced(srcBuffer, benchedSize, 
-                NULL, 0, 
-                fileSizes, nbFiles, 
-                cLevel, comprParams, 
-                dictBuffer, dictBufferSize, 
-                displayLevel, displayName, adv);
-
-    return res;
-}
-
-
 /*! BMK_loadFiles() :
  *  Loads `buffer` with content of files listed within `fileNamesTable`.
  *  At most, fills `buffer` entirely. */
 static int BMK_loadFiles(void* buffer, size_t bufferSize,
-                          size_t* fileSizes, const char* const * const fileNamesTable, 
-                          unsigned nbFiles, int displayLevel)
+                         size_t* fileSizes,
+                         const char* const * fileNamesTable, unsigned nbFiles,
+                         int displayLevel)
 {
     size_t pos = 0, totalSize = 0;
     unsigned n;
@@ -877,9 +974,10 @@ static int BMK_loadFiles(void* buffer, size_t bufferSize,
         if (f==NULL) EXM_THROW_INT(10, "impossible to open file %s", fileNamesTable[n]);
         DISPLAYUPDATE(2, "Loading %s...       \r", fileNamesTable[n]);
         if (fileSize > bufferSize-pos) fileSize = bufferSize-pos, nbFiles=n;   /* buffer too small - stop after this file */
-        { size_t const readSize = fread(((char*)buffer)+pos, 1, (size_t)fileSize, f);
-          if (readSize != (size_t)fileSize) EXM_THROW_INT(11, "could not read %s", fileNamesTable[n]);
-          pos += readSize; }
+        {   size_t const readSize = fread(((char*)buffer)+pos, 1, (size_t)fileSize, f);
+            if (readSize != (size_t)fileSize) EXM_THROW_INT(11, "could not read %s", fileNamesTable[n]);
+            pos += readSize;
+        }
         fileSizes[n] = (size_t)fileSize;
         totalSize += (size_t)fileSize;
         fclose(f);
@@ -889,50 +987,53 @@ static int BMK_loadFiles(void* buffer, size_t bufferSize,
     return 0;
 }
 
-BMK_return_t BMK_benchFilesAdvanced(const char* const * const fileNamesTable, unsigned const nbFiles,
-                               const char* const dictFileName, int const cLevel, 
-                               const ZSTD_compressionParameters* const compressionParams, 
-                               int displayLevel, const BMK_advancedParams_t * const adv)
+BMK_benchOutcome_t BMK_benchFilesAdvanced(
+                        const char* const * fileNamesTable, unsigned nbFiles,
+                        const char* dictFileName, int cLevel,
+                        const ZSTD_compressionParameters* compressionParams,
+                        int displayLevel, const BMK_advancedParams_t* adv)
 {
     void* srcBuffer = NULL;
     size_t benchedSize;
     void* dictBuffer = NULL;
     size_t dictBufferSize = 0;
     size_t* fileSizes = NULL;
-    BMK_return_t res;
+    BMK_benchOutcome_t res;
     U64 const totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles);
 
-    if(!nbFiles) {
-        EXM_THROW(14, BMK_return_t, "No Files to Benchmark");
+    if (!nbFiles) {
+        RETURN_ERROR(14, BMK_benchOutcome_t, "No Files to Benchmark");
     }
 
     if (cLevel > ZSTD_maxCLevel()) {
-        EXM_THROW(15, BMK_return_t, "Invalid Compression Level");
+        RETURN_ERROR(15, BMK_benchOutcome_t, "Invalid Compression Level");
     }
 
     fileSizes = (size_t*)calloc(nbFiles, sizeof(size_t));
-    if (!fileSizes) EXM_THROW(12, BMK_return_t, "not enough memory for fileSizes");
+    if (!fileSizes) RETURN_ERROR(12, BMK_benchOutcome_t, "not enough memory for fileSizes");
+
     /* Load dictionary */
     if (dictFileName != NULL) {
         U64 const dictFileSize = UTIL_getFileSize(dictFileName);
         if (dictFileSize > 64 MB) {
             free(fileSizes);
-            EXM_THROW(10, BMK_return_t, "dictionary file %s too large", dictFileName);
+            RETURN_ERROR(10, BMK_benchOutcome_t, "dictionary file %s too large", dictFileName);
         }
         dictBufferSize = (size_t)dictFileSize;
         dictBuffer = malloc(dictBufferSize);
         if (dictBuffer==NULL) {
             free(fileSizes);
-            EXM_THROW(11, BMK_return_t, "not enough memory for dictionary (%u bytes)",
+            RETURN_ERROR(11, BMK_benchOutcome_t, "not enough memory for dictionary (%u bytes)",
                             (U32)dictBufferSize);
         }
-        {
-            int errorCode = BMK_loadFiles(dictBuffer, dictBufferSize, fileSizes, &dictFileName, 1, displayLevel);
-            if(errorCode) {
-                res.error = errorCode;
+
+        {   int const errorCode = BMK_loadFiles(dictBuffer, dictBufferSize,
+                                                fileSizes, &dictFileName /*?*/,
+                                                1 /*?*/, displayLevel);
+            if (errorCode) {
+                res = BMK_benchOutcome_error();
                 goto _cleanUp;
-            }
-        }
+        }   }
     }
 
     /* Memory allocation & restrictions */
@@ -945,29 +1046,28 @@ BMK_return_t BMK_benchFilesAdvanced(const char* const * const fileNamesTable, un
     if (!srcBuffer) {
         free(dictBuffer);
         free(fileSizes);
-        EXM_THROW(12, BMK_return_t, "not enough memory");
+        RETURN_ERROR(12, BMK_benchOutcome_t, "not enough memory");
     }
 
     /* Load input buffer */
-    {
-        int errorCode = BMK_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles, displayLevel);
-        if(errorCode) {
-            res.error = errorCode;
+    {   int const errorCode = BMK_loadFiles(srcBuffer, benchedSize,
+                                        fileSizes, fileNamesTable, nbFiles,
+                                        displayLevel);
+        if (errorCode) {
+            res = BMK_benchOutcome_error();
             goto _cleanUp;
-        }
-    }
+    }   }
+
     /* Bench */
-    {
-        char mfName[20] = {0};
+    {   char mfName[20] = {0};
         snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
-        {   
-            const char* const displayName = (nbFiles > 1) ? mfName : fileNamesTable[0];
-            res = BMK_benchCLevel(srcBuffer, benchedSize, 
-                            fileSizes, nbFiles, 
-                            cLevel, compressionParams,
-                            dictBuffer, dictBufferSize, 
-                            displayLevel, displayName, 
-                            adv);
+        {   const char* const displayName = (nbFiles > 1) ? mfName : fileNamesTable[0];
+            res = BMK_benchCLevel(srcBuffer, benchedSize,
+                                fileSizes, nbFiles,
+                                cLevel, compressionParams,
+                                dictBuffer, dictBufferSize,
+                                displayLevel, displayName,
+                                adv);
     }   }
 
 _cleanUp:
@@ -978,44 +1078,12 @@ _cleanUp:
 }
 
 
-BMK_return_t BMK_syntheticTest(int cLevel, double compressibility,
-                              const ZSTD_compressionParameters* compressionParams,
-                              int displayLevel, const BMK_advancedParams_t * const adv)
+BMK_benchOutcome_t BMK_benchFiles(
+                    const char* const * fileNamesTable, unsigned nbFiles,
+                    const char* dictFileName,
+                    int cLevel, const ZSTD_compressionParameters* compressionParams,
+                    int displayLevel)
 {
-    char name[20] = {0};
-    size_t benchedSize = 10000000;
-    void* srcBuffer;
-    BMK_return_t res;
-
-    if (cLevel > ZSTD_maxCLevel()) {
-        EXM_THROW(15, BMK_return_t, "Invalid Compression Level");
-    }
-
-    /* Memory allocation */
-    srcBuffer = malloc(benchedSize);
-    if (!srcBuffer) EXM_THROW(21, BMK_return_t, "not enough memory");
-
-    /* Fill input buffer */
-    RDG_genBuffer(srcBuffer, benchedSize, compressibility, 0.0, 0);
-
-    /* Bench */
-    snprintf (name, sizeof(name), "Synthetic %2u%%", (unsigned)(compressibility*100));
-    res = BMK_benchCLevel(srcBuffer, benchedSize, 
-                    &benchedSize, 1, 
-                    cLevel, compressionParams, 
-                    NULL, 0, 
-                    displayLevel, name, adv);
-
-    /* clean up */
-    free(srcBuffer);
-
-    return res;
-}
-
-BMK_return_t BMK_benchFiles(const char* const * const fileNamesTable, unsigned const nbFiles,
-                   const char* const dictFileName, 
-                   int const cLevel, const ZSTD_compressionParameters* const compressionParams, 
-                   int displayLevel) {
-    const BMK_advancedParams_t adv = BMK_initAdvancedParams();
+    BMK_advancedParams_t const adv = BMK_initAdvancedParams();
     return BMK_benchFilesAdvanced(fileNamesTable, nbFiles, dictFileName, cLevel, compressionParams, displayLevel, &adv);
 }
diff --git a/programs/bench.h b/programs/bench.h
index ad4ede1f0..59f415896 100644
--- a/programs/bench.h
+++ b/programs/bench.h
@@ -15,59 +15,82 @@ extern "C" {
 #ifndef BENCH_H_121279284357
 #define BENCH_H_121279284357
 
+/* ===  Dependencies  === */
 #include <stddef.h>   /* size_t */
 #define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_compressionParameters */
 #include "zstd.h"     /* ZSTD_compressionParameters */
 
-/* Creates a struct of type typeName with an int type .error field
- * and a .result field of some baseType. Functions with return
- * typeName pass a successful result with .error = 0 and .result
- * with the intended result, while returning an error will result
- * in .error != 0. 
+
+/* ===  Constants  === */
+
+#define MB_UNIT 1000000
+
+
+/* ===  Benchmark functions  === */
+
+/* Creates a variant `typeName`, able to express "error or valid result".
+ * Functions with return type `typeName`
+ * must first check if result is valid, using BMK_isSuccessful_*(),
+ * and only then can extract `baseType`.
  */
-#define ERROR_STRUCT(baseType, typeName) typedef struct { \
-    baseType result; \
-    int error;       \
-} typeName
+#define VARIANT_ERROR_RESULT(baseType, variantName)  \
+                                             \
+typedef struct {                             \
+    baseType internal_never_use_directly;    \
+    int tag;                                 \
+} variantName
+
 
 typedef struct {
     size_t cSize;
-    U64 cSpeed;   /* bytes / sec */
-    U64 dSpeed;
-    size_t cMem;
-} BMK_result_t;
+    unsigned long long cSpeed;   /* bytes / sec */
+    unsigned long long dSpeed;
+    size_t cMem;                 /* ? what is reported ? */
+} BMK_benchResult_t;
 
-ERROR_STRUCT(BMK_result_t, BMK_return_t);
+VARIANT_ERROR_RESULT(BMK_benchResult_t, BMK_benchOutcome_t);
 
-/* called in cli */
-/* Loads files in fileNamesTable into memory, as well as a dictionary 
- * from dictFileName, and then uses benchMem */
-/* fileNamesTable - name of files to benchmark
- * nbFiles - number of files (size of fileNamesTable), must be > 0
- * dictFileName - name of dictionary file to load
- * cLevel - compression level to benchmark, errors if invalid
- * compressionParams - basic compression Parameters
- * displayLevel - what gets printed
- *      0 : no display;   
- *      1 : errors;   
- *      2 : + result + interaction + warnings;   
- *      3 : + progression;   
- *      4 : + information
- * return 
- *      .error will give a nonzero error value if an error has occured
- *      .result - if .error = 0, .result will return the time taken to compression speed
- *          (.cSpeed), decompression speed (.dSpeed), and compressed size (.cSize) of the original
- *          file
+/* check first if the return structure represents an error or a valid result */
+int BMK_isSuccessful_benchOutcome(BMK_benchOutcome_t outcome);
+
+/* extract result from variant type.
+ * note : this function will abort() program execution if result is not valid
+ *        check result validity first, by using BMK_isSuccessful_benchOutcome()
  */
-BMK_return_t BMK_benchFiles(const char* const * const fileNamesTable, unsigned const nbFiles,
-                   const char* const dictFileName, 
-                   int const cLevel, const ZSTD_compressionParameters* const compressionParams, 
+BMK_benchResult_t BMK_extract_benchResult(BMK_benchOutcome_t outcome);
+
+
+/*! BMK_benchFiles() -- called by zstdcli */
+/*  Loads files from fileNamesTable into memory,
+ *  and an optional dictionary from dictFileName (can be NULL),
+ *  then uses benchMem().
+ *  fileNamesTable - name of files to benchmark.
+ *  nbFiles - number of files (size of fileNamesTable), must be > 0.
+ *  dictFileName - name of dictionary file to load.
+ *  cLevel - compression level to benchmark, errors if invalid.
+ *  compressionParams - advanced compression Parameters.
+ *  displayLevel - what gets printed:
+ *      0 : no display;
+ *      1 : errors;
+ *      2 : + result + interaction + warnings;
+ *      3 : + information;
+ *      4 : + debug
+ * @return:
+ *      a variant, which expresses either an error, or a valid result.
+ *      Use BMK_isSuccessful_benchOutcome() to check if function was successful.
+ *      If yes, extract the valid result with BMK_extract_benchResult(),
+ *      it will contain :
+ *          .cSpeed: compression speed in bytes per second,
+ *          .dSpeed: decompression speed in bytes per second,
+ *          .cSize : compressed size, in bytes
+ *          .cMem  : memory budget required for the compression context
+ */
+BMK_benchOutcome_t BMK_benchFiles(
+                   const char* const * fileNamesTable, unsigned nbFiles,
+                   const char* dictFileName,
+                   int cLevel, const ZSTD_compressionParameters* compressionParams,
                    int displayLevel);
 
-typedef enum {
-    BMK_timeMode = 0,
-    BMK_iterMode = 1
-} BMK_loopMode_t;
 
 typedef enum {
     BMK_both = 0,
@@ -77,15 +100,14 @@ typedef enum {
 
 typedef struct {
     BMK_mode_t mode;            /* 0: all, 1: compress only 2: decode only */
-    BMK_loopMode_t loopMode;    /* if loopmode, then nbSeconds = nbLoops */
     unsigned nbSeconds;         /* default timing is in nbSeconds */
-    size_t blockSize;           /* Maximum allowable size of a block*/
+    size_t blockSize;           /* Maximum size of each block*/
     unsigned nbWorkers;         /* multithreading */
     unsigned realTime;          /* real time priority */
     int additionalParam;        /* used by python speed benchmark */
     unsigned ldmFlag;           /* enables long distance matching */
-    unsigned ldmMinMatch;       /* below: parameters for long distance matching, see zstd.1.md for meaning */
-    unsigned ldmHashLog; 
+    unsigned ldmMinMatch;       /* below: parameters for long distance matching, see zstd.1.md */
+    unsigned ldmHashLog;
     unsigned ldmBucketSizeLog;
     unsigned ldmHashEveryLog;
 } BMK_advancedParams_t;
@@ -93,132 +115,186 @@ typedef struct {
 /* returns default parameters used by nonAdvanced functions */
 BMK_advancedParams_t BMK_initAdvancedParams(void);
 
-/* See benchFiles for normal parameter uses and return, see advancedParams_t for adv */
-BMK_return_t BMK_benchFilesAdvanced(const char* const * const fileNamesTable, unsigned const nbFiles,
-                   const char* const dictFileName, 
-                   int const cLevel, const ZSTD_compressionParameters* const compressionParams, 
-                   int displayLevel, const BMK_advancedParams_t* const adv);
+/*! BMK_benchFilesAdvanced():
+ *  Same as BMK_benchFiles(),
+ *  with more controls, provided through advancedParams_t structure */
+BMK_benchOutcome_t BMK_benchFilesAdvanced(
+                   const char* const * fileNamesTable, unsigned nbFiles,
+                   const char* dictFileName,
+                   int cLevel, const ZSTD_compressionParameters* compressionParams,
+                   int displayLevel, const BMK_advancedParams_t* adv);
 
-/* called in cli */
-/* Generates a sample with datagen with the compressibility argument*/
-/* cLevel - compression level to benchmark, errors if invalid
- * compressibility - determines compressibility of sample
- * compressionParams - basic compression Parameters
- * displayLevel - see benchFiles
- * adv - see advanced_Params_t
- * return 
- *      .error will give a nonzero error value if an error has occured
- *      .result - if .error = 0, .result will return the time taken to compression speed
- *          (.cSpeed), decompression speed (.dSpeed), and compressed size (.cSize) of the original
- *          file
+/*! BMK_syntheticTest() -- called from zstdcli */
+/*  Generates a sample with datagen, using compressibility argument */
+/*  cLevel - compression level to benchmark, errors if invalid
+ *  compressibility - determines compressibility of sample
+ *  compressionParams - basic compression Parameters
+ *  displayLevel - see benchFiles
+ *  adv - see advanced_Params_t
+ * @return:
+ *      a variant, which expresses either an error, or a valid result.
+ *      Use BMK_isSuccessful_benchOutcome() to check if function was successful.
+ *      If yes, extract the valid result with BMK_extract_benchResult(),
+ *      it will contain :
+ *          .cSpeed: compression speed in bytes per second,
+ *          .dSpeed: decompression speed in bytes per second,
+ *          .cSize : compressed size, in bytes
+ *          .cMem  : memory budget required for the compression context
  */
-BMK_return_t BMK_syntheticTest(int cLevel, double compressibility,
+BMK_benchOutcome_t BMK_syntheticTest(
+                              int cLevel, double compressibility,
                               const ZSTD_compressionParameters* compressionParams,
-                              int displayLevel, const BMK_advancedParams_t * const adv);
+                              int displayLevel, const BMK_advancedParams_t* adv);
 
-/* basic benchmarking function, called in paramgrill 
- * applies ZSTD_compress_generic() and ZSTD_decompress_generic() on data in srcBuffer
- * with specific compression parameters specified by other arguments using benchFunction
- * (cLevel, comprParams + adv in advanced Mode) */
-/* srcBuffer - data source, expected to be valid compressed data if in Decode Only Mode
- * srcSize - size of data in srcBuffer
- * cLevel - compression level  
- * comprParams - basic compression parameters
- * dictBuffer - a dictionary if used, null otherwise
- * dictBufferSize - size of dictBuffer, 0 otherwise
- * diplayLevel - see BMK_benchFiles
- * displayName - name used by display
- * return
- *      .error will give a nonzero value if an error has occured
- *      .result - if .error = 0, will give the same results as benchFiles
- *          but for the data stored in srcBuffer
+
+
+/* ===  Benchmark Zstandard in a memory-to-memory scenario  === */
+
+/** BMK_benchMem() -- core benchmarking function, called in paramgrill
+ *  applies ZSTD_compress_generic() and ZSTD_decompress_generic() on data in srcBuffer
+ *  with specific compression parameters provided by other arguments using benchFunction
+ *  (cLevel, comprParams + adv in advanced Mode) */
+/*  srcBuffer - data source, expected to be valid compressed data if in Decode Only Mode
+ *  srcSize - size of data in srcBuffer
+ *  fileSizes - srcBuffer is considered cut into 1+ segments, to compress separately.
+ *              note : sum(fileSizes) must be == srcSize.  (<== ensure it's properly checked)
+ *  nbFiles - nb of segments
+ *  cLevel - compression level
+ *  comprParams - basic compression parameters
+ *  dictBuffer - a dictionary if used, null otherwise
+ *  dictBufferSize - size of dictBuffer, 0 otherwise
+ *  diplayLevel - see BMK_benchFiles
+ *  displayName - name used by display
+ * @return:
+ *      a variant, which expresses either an error, or a valid result.
+ *      Use BMK_isSuccessful_benchOutcome() to check if function was successful.
+ *      If yes, extract the valid result with BMK_extract_benchResult(),
+ *      it will contain :
+ *          .cSpeed: compression speed in bytes per second,
+ *          .dSpeed: decompression speed in bytes per second,
+ *          .cSize : compressed size, in bytes
+ *          .cMem  : memory budget required for the compression context
  */
-BMK_return_t BMK_benchMem(const void* srcBuffer, size_t srcSize,
+BMK_benchOutcome_t BMK_benchMem(const void* srcBuffer, size_t srcSize,
                         const size_t* fileSizes, unsigned nbFiles,
-                        const int cLevel, const ZSTD_compressionParameters* comprParams,
+                        int cLevel, const ZSTD_compressionParameters* comprParams,
                         const void* dictBuffer, size_t dictBufferSize,
                         int displayLevel, const char* displayName);
 
-/* See benchMem for normal parameter uses and return, see advancedParams_t for adv 
+/* BMK_benchMemAdvanced() : same as BMK_benchMem()
+ * with following additional options :
  * dstBuffer - destination buffer to write compressed output in, NULL if none provided.
  * dstCapacity - capacity of destination buffer, give 0 if dstBuffer = NULL
+ * adv = see advancedParams_t
  */
-BMK_return_t BMK_benchMemAdvanced(const void* srcBuffer, size_t srcSize,
-                        void* dstBuffer, size_t dstCapacity, 
+BMK_benchOutcome_t BMK_benchMemAdvanced(const void* srcBuffer, size_t srcSize,
+                        void* dstBuffer, size_t dstCapacity,
                         const size_t* fileSizes, unsigned nbFiles,
-                        const int cLevel, const ZSTD_compressionParameters* comprParams,
+                        int cLevel, const ZSTD_compressionParameters* comprParams,
                         const void* dictBuffer, size_t dictBufferSize,
                         int displayLevel, const char* displayName,
                         const BMK_advancedParams_t* adv);
 
+
+
+/* ====  Benchmarking any function, iterated on a set of blocks  ==== */
+
 typedef struct {
-    size_t sumOfReturn;    /* sum of return values */
-    U64 nanoSecPerRun;     /* time per iteration */
-} BMK_customResult_t;
+    unsigned long long nanoSecPerRun;  /* time per iteration */
+    size_t sumOfReturn;       /* sum of return values */
+} BMK_runTime_t;
 
-ERROR_STRUCT(BMK_customResult_t, BMK_customReturn_t);
+VARIANT_ERROR_RESULT(BMK_runTime_t, BMK_runOutcome_t);
 
-typedef size_t (*BMK_benchFn_t)(const void*, size_t, void*, size_t, void*);
-typedef size_t (*BMK_initFn_t)(void*);
+/* check first if the return structure represents an error or a valid result */
+int BMK_isSuccessful_runOutcome(BMK_runOutcome_t outcome);
 
-/* This function times the execution of 2 argument functions, benchFn and initFn  */
+/* extract result from variant type.
+ * note : this function will abort() program execution if result is not valid
+ *        check result validity first, by using BMK_isSuccessful_runOutcome()
+ */
+BMK_runTime_t BMK_extract_runTime(BMK_runOutcome_t outcome);
+
+
+
+typedef size_t (*BMK_benchFn_t)(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* customPayload);
+typedef size_t (*BMK_initFn_t)(void* initPayload);
+
+
+/* BMK_benchFunction() :
+ * This function times the execution of 2 argument functions, benchFn and initFn  */
 
 /* benchFn - (*benchFn)(srcBuffers[i], srcSizes[i], dstBuffers[i], dstCapacities[i], benchPayload)
  *      is run nbLoops times
- * initFn - (*initFn)(initPayload) is run once per benchmark at the beginning. This argument can 
- *          be NULL, in which case nothing is run.
- * blockCount - number of blocks (size of srcBuffers, srcSizes, dstBuffers, dstCapacities)
+ * initFn - (*initFn)(initPayload) is run once per benchmark, at the beginning.
+ *      This argument can be NULL, in which case nothing is run.
+ * blockCount - number of blocks. Size of all array parameters : srcBuffers, srcSizes, dstBuffers, dstCapacities, blockResults
  * srcBuffers - an array of buffers to be operated on by benchFn
  * srcSizes - an array of the sizes of above buffers
  * dstBuffers - an array of buffers to be written into by benchFn
  * dstCapacities - an array of the capacities of above buffers
- * blockResults - the return value of benchFn called on each block.
+ * blockResults - Optional: store the return value of benchFn for each block. Use NULL if this result is not requested.
  * nbLoops - defines number of times benchFn is run.
- * assumed array of size blockCount, will have compressed size of each block written to it.
- * return 
- *      .error will give a nonzero value if ZSTD_isError() is nonzero for any of the return
- *          of the calls to initFn and benchFn, or if benchFunction errors internally
- *      .result - if .error = 0, then .result will contain the sum of all return values of 
- *          benchFn on the first iteration through all of the blocks (.sumOfReturn) and also 
- *          the time per run of benchFn (.nanoSecPerRun). For the former, this
- *          is generally intended to be used on functions which return the # of bytes written 
- *          into dstBuffer, hence this value will be the total amount of bytes written to 
- *          dstBuffer.
+ * @return: a variant, which express either an error, or can generate a valid BMK_runTime_t result.
+ *          Use BMK_isSuccessful_runOutcome() to check if function was successful.
+ *          If yes, extract the result with BMK_extract_runTime(),
+ *          it will contain :
+ *              .sumOfReturn : the sum of all return values of benchFn through all of blocks
+ *              .nanoSecPerRun : time per run of benchFn + (time for initFn / nbLoops)
+ *          .sumOfReturn is generally intended for functions which return a # of bytes written into dstBuffer,
+ *              in which case, this value will be the total amount of bytes written into dstBuffer.
  */
-BMK_customReturn_t BMK_benchFunction(BMK_benchFn_t benchFn, void* benchPayload,
+BMK_runOutcome_t BMK_benchFunction(
+                        BMK_benchFn_t benchFn, void* benchPayload,
                         BMK_initFn_t initFn, void* initPayload,
                         size_t blockCount,
-                        const void* const * const srcBuffers, const size_t* srcSizes,
-                        void * const * const dstBuffers, const size_t* dstCapacities, size_t* blockResults,  
+                        const void *const * srcBuffers, const size_t* srcSizes,
+                        void *const * dstBuffers, const size_t* dstCapacities,
+                        size_t* blockResults,
                         unsigned nbLoops);
 
 
-/* state information needed to advance computation for benchFunctionTimed */
-typedef struct BMK_timeState_t BMK_timedFnState_t;
-/* initializes timeState object with desired number of seconds */
-BMK_timedFnState_t* BMK_createTimeState(unsigned nbSeconds);
-/* resets existing timeState object */
-void BMK_resetTimeState(BMK_timedFnState_t*, unsigned nbSeconds);
-/* deletes timeState object */
-void BMK_freeTimeState(BMK_timedFnState_t* state);
 
-typedef struct {
-    BMK_customReturn_t result;
-    int completed;
-} BMK_customTimedReturn_t;
+/* ====  Benchmark any function, providing intermediate results  ==== */
 
-/* 
- * Benchmarks custom functions like BMK_benchFunction(), but runs for nbSeconds seconds rather than a fixed number of loops
- * arguments mostly the same other than BMK_benchFunction()
- * Usage - benchFunctionTimed will return in approximately one second. Keep calling BMK_benchFunctionTimed() until the return's completed field = 1. 
- * to continue updating intermediate result. Intermediate return values are returned by the function.
+/* state information tracking benchmark session */
+typedef struct BMK_timedFnState_s BMK_timedFnState_t;
+
+/* BMK_createTimedFnState() and BMK_resetTimedFnState() :
+ * Create/Set BMK_timedFnState_t for next benchmark session,
+ * which shall last a minimum of total_ms milliseconds,
+ * producing intermediate results, paced at interval of (approximately) run_ms.
  */
-BMK_customTimedReturn_t BMK_benchFunctionTimed(BMK_timedFnState_t* cont,
-    BMK_benchFn_t benchFn, void* benchPayload,
-    BMK_initFn_t initFn, void* initPayload,
-    size_t blockCount,
-    const void* const * const srcBlockBuffers, const size_t* srcBlockSizes,
-    void* const * const dstBlockBuffers, const size_t* dstBlockCapacities, size_t* blockResults);
+BMK_timedFnState_t* BMK_createTimedFnState(unsigned total_ms, unsigned run_ms);
+void BMK_resetTimedFnState(BMK_timedFnState_t* timedFnState, unsigned total_ms, unsigned run_ms);
+void BMK_freeTimedFnState(BMK_timedFnState_t* state);
+
+
+/* Tells if duration of all benchmark runs has exceeded total_ms
+ */
+int BMK_isCompleted_TimedFn(const BMK_timedFnState_t* timedFnState);
+
+
+/* BMK_benchTimedFn() :
+ * Similar to BMK_benchFunction(), most arguments being identical.
+ * Automatically determines `nbLoops` so that each result is regularly produced at interval of about run_ms.
+ * Note : minimum `nbLoops` is 1, therefore a run may last more than run_ms, and possibly even more than total_ms.
+ * Usage - initialize timedFnState, select benchmark duration (total_ms) and each measurement duration (run_ms)
+ *         call BMK_benchTimedFn() repetitively, each measurement is supposed to last about run_ms
+ *         Check if total time budget is spent or exceeded, using BMK_isCompleted_TimedFn()
+ */
+BMK_runOutcome_t BMK_benchTimedFn(
+                    BMK_timedFnState_t* timedFnState,
+                    BMK_benchFn_t benchFn, void* benchPayload,
+                    BMK_initFn_t initFn, void* initPayload,
+                    size_t blockCount,
+                    const void *const * srcBlockBuffers, const size_t* srcBlockSizes,
+                    void *const * dstBlockBuffers, const size_t* dstBlockCapacities,
+                    size_t* blockResults);
+
+
+
+
 
 #endif   /* BENCH_H_121279284357 */
 
diff --git a/programs/dibio.c b/programs/dibio.c
index 5d1f6d6c4..4b68be6c9 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -27,6 +27,7 @@
 #include <string.h>         /* memset */
 #include <stdio.h>          /* fprintf, fopen, ftello64 */
 #include <errno.h>          /* errno */
+#include <assert.h>
 
 #include "mem.h"            /* read */
 #include "error_private.h"
@@ -43,6 +44,7 @@
 #define SAMPLESIZE_MAX (128 KB)
 #define MEMMULT 11    /* rough estimation : memory cost to analyze 1 byte of sample */
 #define COVER_MEMMULT 9    /* rough estimation : memory cost to analyze 1 byte of sample */
+#define FASTCOVER_MEMMULT 1    /* rough estimation : memory cost to analyze 1 byte of sample */
 static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
 
 #define NOISELENGTH 32
@@ -165,6 +167,7 @@ static U32 DiB_rand(U32* src)
 static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
     U32 seed = 0xFD2FB528;
     unsigned i;
+    assert(nbFiles >= 1);
     for (i = nbFiles - 1; i > 0; --i) {
         unsigned const j = DiB_rand(&seed) % (i + 1);
         const char* const tmp = fileNamesTable[j];
@@ -269,16 +272,19 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(void* dictBuffer, size_t dictBufferCa
 
 int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                        const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
-                       ZDICT_legacy_params_t *params, ZDICT_cover_params_t *coverParams,
-                       int optimizeCover)
+                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize)
 {
     unsigned const displayLevel = params ? params->zParams.notificationLevel :
                         coverParams ? coverParams->zParams.notificationLevel :
+                        fastCoverParams ? fastCoverParams->zParams.notificationLevel :
                         0;   /* should never happen */
     void* const dictBuffer = malloc(maxDictSize);
     fileStats const fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
     size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
-    size_t const memMult = params ? MEMMULT : COVER_MEMMULT;
+    size_t const memMult = params ? MEMMULT :
+                           coverParams ? COVER_MEMMULT:
+                           FASTCOVER_MEMMULT;
     size_t const maxMem =  DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
     size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
     void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
@@ -310,7 +316,8 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
     /* Load input buffer */
     DISPLAYLEVEL(3, "Shuffling input files\n");
     DiB_shuffle(fileNamesTable, nbFiles);
-    nbFiles = DiB_loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
+
+    DiB_loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
 
     {   size_t dictSize;
         if (params) {
@@ -318,17 +325,36 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
             dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize,
                                                            srcBuffer, sampleSizes, fs.nbSamples,
                                                            *params);
-        } else if (optimizeCover) {
-            dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
-                                                           srcBuffer, sampleSizes, fs.nbSamples,
-                                                           coverParams);
-            if (!ZDICT_isError(dictSize)) {
-                unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
-                DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d, coverParams->steps, splitPercentage);
+        } else if (coverParams) {
+            if (optimize) {
+              dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
+                                                             srcBuffer, sampleSizes, fs.nbSamples,
+                                                             coverParams);
+              if (!ZDICT_isError(dictSize)) {
+                  unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
+                  DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
+                              coverParams->steps, splitPercentage);
+              }
+            } else {
+              dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
+                                                     sampleSizes, fs.nbSamples, *coverParams);
             }
         } else {
-            dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
-                                                   sampleSizes, fs.nbSamples, *coverParams);
+            assert(fastCoverParams != NULL);
+            if (optimize) {
+              dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
+                                                              srcBuffer, sampleSizes, fs.nbSamples,
+                                                              fastCoverParams);
+              if (!ZDICT_isError(dictSize)) {
+                unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
+                DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
+                            fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
+                            fastCoverParams->accel);
+              }
+            } else {
+              dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
+                                                        sampleSizes, fs.nbSamples, *fastCoverParams);
+            }
         }
         if (ZDICT_isError(dictSize)) {
             DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
diff --git a/programs/dibio.h b/programs/dibio.h
index 499e30365..ea163fe6a 100644
--- a/programs/dibio.h
+++ b/programs/dibio.h
@@ -33,7 +33,7 @@
 */
 int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                        const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
-                       ZDICT_legacy_params_t *params, ZDICT_cover_params_t *coverParams,
-                       int optimizeCover);
+                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize);
 
 #endif
diff --git a/programs/fileio.c b/programs/fileio.c
index aeacd0440..6ea43c902 100644
--- a/programs/fileio.c
+++ b/programs/fileio.c
@@ -30,6 +30,10 @@
 #include <stdlib.h>     /* malloc, free */
 #include <string.h>     /* strcmp, strlen */
 #include <errno.h>      /* errno */
+#include <signal.h>
+#ifndef _WIN32
+#include <execinfo.h>   /* backtrace, backtrace_symbols */
+#endif
 
 #if defined (_MSC_VER)
 #  include <sys/stat.h>
@@ -125,8 +129,6 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
 /*-************************************
 *  Signal (Ctrl-C trapping)
 **************************************/
-#include  <signal.h>
-
 static const char* g_artefact = NULL;
 static void INThandler(int sig)
 {
@@ -158,7 +160,60 @@ static void clearHandler(void)
 }
 
 
-/* ************************************************************
+/*-*********************************************************
+*  Termination signal trapping (Print debug stack trace)
+***********************************************************/
+#define MAX_STACK_FRAMES    50
+
+#ifndef _WIN32
+static void ABRThandler(int sig) {
+    const char* name;
+    void* addrlist[MAX_STACK_FRAMES];
+    char** symbollist;
+    U32 addrlen, i;
+
+    switch (sig) {
+        case SIGABRT: name = "SIGABRT"; break;
+        case SIGFPE: name = "SIGFPE"; break;
+        case SIGILL: name = "SIGILL"; break;
+        case SIGINT: name = "SIGINT"; break;
+        case SIGSEGV: name = "SIGSEGV"; break;
+        default: name = "UNKNOWN";
+    }
+
+    DISPLAY("Caught %s signal, printing stack:\n", name);
+    /* Retrieve current stack addresses. */
+    addrlen = backtrace(addrlist, MAX_STACK_FRAMES);
+    if (addrlen == 0) {
+        DISPLAY("\n");
+        return;
+    }
+    /* Create readable strings to each frame. */
+    symbollist = backtrace_symbols(addrlist, addrlen);
+    /* Print the stack trace, excluding calls handling the signal. */
+    for (i = ZSTD_START_SYMBOLLIST_FRAME; i < addrlen; i++) {
+        DISPLAY("%s\n", symbollist[i]);
+    }
+    free(symbollist);
+    /* Reset and raise the signal so default handler runs. */
+    signal(sig, SIG_DFL);
+    raise(sig);
+}
+#endif
+
+void FIO_addAbortHandler()
+{
+#ifndef _WIN32
+    signal(SIGABRT, ABRThandler);
+    signal(SIGFPE, ABRThandler);
+    signal(SIGILL, ABRThandler);
+    signal(SIGSEGV, ABRThandler);
+    signal(SIGBUS, ABRThandler);
+#endif
+}
+
+
+/*-************************************************************
 * Avoid fseek()'s 2GiB barrier with MSVC, macOS, *BSD, MinGW
 ***************************************************************/
 #if defined(_MSC_VER) && _MSC_VER >= 1400
@@ -1119,8 +1174,8 @@ int FIO_compressMultipleFilenames(const char** inFileNamesTable, unsigned nbFile
                 if (!dstFileName) {
                     EXM_THROW(30, "zstd: %s", strerror(errno));
             }   }
-            strcpy(dstFileName, inFileNamesTable[u]);
-            strcat(dstFileName, suffix);
+            strncpy(dstFileName, inFileNamesTable[u], ifnSize+1 /* Include null */);
+            strncat(dstFileName, suffix, suffixSize);
             missed_files += FIO_compressFilename_dstFile(ress, dstFileName, inFileNamesTable[u], compressionLevel);
     }   }
 
diff --git a/programs/fileio.h b/programs/fileio.h
index 9c1f357a6..94789929e 100644
--- a/programs/fileio.h
+++ b/programs/fileio.h
@@ -96,6 +96,9 @@ int FIO_decompressMultipleFilenames(const char** srcNamesTable, unsigned nbFiles
                                     const char* dictFileName);
 
 
+/* custom crash signal handler */
+void FIO_addAbortHandler(void);
+
 #if defined (__cplusplus)
 }
 #endif
diff --git a/programs/platform.h b/programs/platform.h
index a550eb1a3..89eba37ec 100644
--- a/programs/platform.h
+++ b/programs/platform.h
@@ -148,6 +148,17 @@ static __inline int IS_CONSOLE(FILE* stdStream) {
 #endif
 
 
+#ifndef ZSTD_START_SYMBOLLIST_FRAME
+#  ifdef __linux__
+#    define ZSTD_START_SYMBOLLIST_FRAME 2
+#  elif defined __APPLE__
+#    define ZSTD_START_SYMBOLLIST_FRAME 4
+#  else
+#    define ZSTD_START_SYMBOLLIST_FRAME 0
+#  endif
+#endif
+
+
 #if defined (__cplusplus)
 }
 #endif
diff --git a/programs/util.h b/programs/util.h
index 4392a5bd0..8993bf964 100644
--- a/programs/util.h
+++ b/programs/util.h
@@ -319,15 +319,18 @@ UTIL_STATIC U32 UTIL_isDirectory(const char* infilename)
 
 UTIL_STATIC U32 UTIL_isLink(const char* infilename)
 {
-#if defined(_WIN32)
-    /* no symlinks on windows */
-    (void)infilename;
-#else
+/* macro guards, as defined in : https://linux.die.net/man/2/lstat */
+#if defined(_BSD_SOURCE) \
+    || (defined(_XOPEN_SOURCE) && (_XOPEN_SOURCE >= 500)) \
+    || (defined(_XOPEN_SOURCE) && defined(_XOPEN_SOURCE_EXTENDED)) \
+    || (defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L)) \
+    || (defined(__APPLE__) && defined(__MACH__))
     int r;
     stat_t statbuf;
     r = lstat(infilename, &statbuf);
     if (!r && S_ISLNK(statbuf.st_mode)) return 1;
 #endif
+    (void)infilename;
     return 0;
 }
 
@@ -526,7 +529,10 @@ UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_
  * After finishing usage of the list the structures should be freed with UTIL_freeFileList(params: return value, allocatedBuffer)
  * In case of error UTIL_createFileList returns NULL and UTIL_freeFileList should not be called.
  */
-UTIL_STATIC const char** UTIL_createFileList(const char **inputNames, unsigned inputNamesNb, char** allocatedBuffer, unsigned* allocatedNamesNb, int followLinks)
+UTIL_STATIC const char**
+UTIL_createFileList(const char **inputNames, unsigned inputNamesNb,
+                    char** allocatedBuffer, unsigned* allocatedNamesNb,
+                    int followLinks)
 {
     size_t pos;
     unsigned i, nbFiles;
diff --git a/programs/zstd.1 b/programs/zstd.1
index 58473102e..53fedbb9b 100644
--- a/programs/zstd.1
+++ b/programs/zstd.1
@@ -194,7 +194,7 @@ All arguments after \fB\-\-\fR are treated as files
 Use FILEs as training set to create a dictionary\. The training set should contain a lot of small files (> 100), and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)\.
 .
 .IP
-Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-cover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. Equivalent to \fB\-\-train\-cover=d=8,steps=4\fR\.
+Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-fastcover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\.  The cover dictionary builder can be accessed with \fB\-\-train\-cover\fR\. Equivalent to \fB\-\-train\-fastCover=d=8,steps=4\fR\.
 .
 .TP
 \fB\-o file\fR
@@ -240,6 +240,25 @@ Examples:
 \fBzstd \-\-train\-cover=k=50 FILEs\fR
 .
 .TP
+\fB\-\-train\-fastcover[=k#,d=#,f=#,steps=#,split=#,accel=#]\fR
+Same as cover but with extra parameters \fIf\fR and \fIaccel\fR and different default value of split
+.
+.IP
+If \fIsplit\fR is not specified, then it tries \fIsplit\fR = 75. If \fIf\fR is not specified, then it tries \fIf\fR = 20. Requires that 0 < \fIf\fR < 32. If \fIaccel\fR is not specified, then it tries \fIaccel\fR = 1. Requires that 0 < \fIaccel\fR <= 10. Requires that \fId\fR = 6 or \fId\fR = 8.
+.
+.IP
+\fIf\fR is log of size of array that keeps track of frequency of subsegments of size \fId\fR. The subsegment is hashed to an index in the range [0,2^\fIf\fR - 1]. It is possible that 2 different subsegments are hashed to the same index, and they are considered as the same subsegment when computing frequency. Using a higher \fIf\fR reduces collision but takes longer.
+.
+.IP
+Examples:
+.
+.IP
+\fBzstd \-\-train\-fastcover FILEs\fR
+.
+.IP
+\fBzstd \-\-train\-fastcover=d=8,f=15,accel=2 FILEs\fR
+.
+.TP
 \fB\-\-train\-legacy[=selectivity=#]\fR
 Use legacy dictionary builder algorithm with the given dictionary \fIselectivity\fR (default: 9)\. The smaller the \fIselectivity\fR value, the denser the dictionary, improving its efficiency but reducing its possible maximum size\. \fB\-\-train\-legacy=s=#\fR is also accepted\.
 .
diff --git a/programs/zstd.1.md b/programs/zstd.1.md
index b662758ac..48d39474e 100644
--- a/programs/zstd.1.md
+++ b/programs/zstd.1.md
@@ -207,9 +207,10 @@ Compression of small files similar to the sample set will be greatly improved.
     (for example, 10 MB for a 100 KB dictionary).
 
     Supports multithreading if `zstd` is compiled with threading support.
-    Additional parameters can be specified with `--train-cover`.
+    Additional parameters can be specified with `--train-fastcover`.
     The legacy dictionary builder can be accessed with `--train-legacy`.
-    Equivalent to `--train-cover=d=8,steps=4`.
+    The cover dictionary builder can be accessed with `--train-cover`.
+    Equivalent to `--train-fastcover=d=8,steps=4`.
 * `-o file`:
     Dictionary saved into `file` (default name: dictionary).
 * `--maxdict=#`:
@@ -261,6 +262,26 @@ Compression of small files similar to the sample set will be greatly improved.
 
     `zstd --train-cover=k=50,split=60 FILEs`
 
+* `--train-fastcover[=k#,d=#,f=#,steps=#,split=#,accel=#]`:
+    Same as cover but with extra parameters _f_ and _accel_ and different default value of split
+    If _split_ is not specified, then it tries _split_ = 75.
+    If _f_ is not specified, then it tries _f_ = 20.
+    Requires that 0 < _f_ < 32.
+    If _accel_ is not specified, then it tries _accel_ = 1.
+    Requires that 0 < _accel_ <= 10.
+    Requires that _d_ = 6 or _d_ = 8.
+
+    _f_ is log of size of array that keeps track of frequency of subsegments of size _d_.
+    The subsegment is hashed to an index in the range [0,2^_f_ - 1].
+    It is possible that 2 different subsegments are hashed to the same index, and they are considered as the same subsegment when computing frequency.
+    Using a higher _f_ reduces collision but takes longer.
+
+    Examples:
+
+    `zstd --train-fastcover FILEs`
+
+    `zstd --train-fastcover=d=8,f=15,accel=2 FILEs`
+
 * `--train-legacy[=selectivity=#]`:
     Use legacy dictionary builder algorithm with the given dictionary
     _selectivity_ (default: 9).
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index d1bb5ed25..36140c496 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -84,7 +84,10 @@ static U32 g_ldmMinMatch = 0;
 static U32 g_ldmHashEveryLog = LDM_PARAM_DEFAULT;
 static U32 g_ldmBucketSizeLog = LDM_PARAM_DEFAULT;
 
-#define DEFAULT_SPLITPOINT 1.0
+
+#define DEFAULT_ACCEL 1
+
+typedef enum { cover, fastCover, legacy } dictType;
 
 /*-************************************
 *  Display Macros
@@ -173,6 +176,7 @@ static int usage_advanced(const char* programName)
     DISPLAY( "Dictionary builder : \n");
     DISPLAY( "--train ## : create a dictionary from a training set of files \n");
     DISPLAY( "--train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args\n");
+    DISPLAY( "--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#] : use the fast cover algorithm with optional args\n");
     DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel);
     DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
     DISPLAY( "--maxdict=# : limit dictionary to specified size (default: %u) \n", g_defaultMaxDictSize);
@@ -296,6 +300,33 @@ static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t
     return 1;
 }
 
+/**
+ * parseFastCoverParameters() :
+ * reads fastcover parameters from *stringPtr (e.g. "--train-fastcover=k=48,d=8,f=20,steps=32,accel=2") into *params
+ * @return 1 means that fastcover parameters were correct
+ * @return 0 in case of malformed parameters
+ */
+static unsigned parseFastCoverParameters(const char* stringPtr, ZDICT_fastCover_params_t* params)
+{
+    memset(params, 0, sizeof(*params));
+    for (; ;) {
+        if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "f=")) { params->f = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "accel=")) { params->accel = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "split=")) {
+          unsigned splitPercentage = readU32FromChar(&stringPtr);
+          params->splitPoint = (double)splitPercentage / 100.0;
+          if (stringPtr[0]==',') { stringPtr++; continue; } else break;
+        }
+        return 0;
+    }
+    if (stringPtr[0] != 0) return 0;
+    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint * 100), params->accel);
+    return 1;
+}
+
 /**
  * parseLegacyParameters() :
  * reads legacy dictioanry builter parameters from *stringPtr (e.g. "--train-legacy=selectivity=8") into *selectivity
@@ -317,7 +348,19 @@ static ZDICT_cover_params_t defaultCoverParams(void)
     memset(&params, 0, sizeof(params));
     params.d = 8;
     params.steps = 4;
-    params.splitPoint = DEFAULT_SPLITPOINT;
+    params.splitPoint = 1.0;
+    return params;
+}
+
+static ZDICT_fastCover_params_t defaultFastCoverParams(void)
+{
+    ZDICT_fastCover_params_t params;
+    memset(&params, 0, sizeof(params));
+    params.d = 8;
+    params.f = 20;
+    params.steps = 4;
+    params.splitPoint = 0.75; /* different from default splitPoint of cover */
+    params.accel = DEFAULT_ACCEL;
     return params;
 }
 #endif
@@ -433,7 +476,8 @@ int main(int argCount, const char* argv[])
 #endif
 #ifndef ZSTD_NODICT
     ZDICT_cover_params_t coverParams = defaultCoverParams();
-    int cover = 1;
+    ZDICT_fastCover_params_t fastCoverParams = defaultFastCoverParams();
+    dictType dict = fastCover;
 #endif
 #ifndef ZSTD_NOBENCH
     BMK_advancedParams_t benchParams = BMK_initAdvancedParams();
@@ -469,6 +513,9 @@ int main(int argCount, const char* argv[])
     if (exeNameMatch(programName, ZSTD_UNLZ4)) { operation=zom_decompress; FIO_setCompressionType(FIO_lz4Compression); }                                   /* behave like unlz4, also supports multiple formats */
     memset(&compressionParams, 0, sizeof(compressionParams));
 
+    /* init crash handler */
+    FIO_addAbortHandler();
+
     /* command switches */
     for (argNb=1; argNb<argCount; argNb++) {
         const char* argument = argv[argNb];
@@ -533,18 +580,29 @@ int main(int argCount, const char* argv[])
                       operation = zom_train;
                       if (outFileName == NULL)
                           outFileName = g_defaultDictName;
-                      cover = 1;
+                      dict = cover;
                       /* Allow optional arguments following an = */
                       if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); }
                       else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
                       else if (!parseCoverParameters(argument, &coverParams)) { CLEAN_RETURN(badusage(programName)); }
                       continue;
                     }
+                    if (longCommandWArg(&argument, "--train-fastcover")) {
+                      operation = zom_train;
+                      if (outFileName == NULL)
+                          outFileName = g_defaultDictName;
+                      dict = fastCover;
+                      /* Allow optional arguments following an = */
+                      if (*argument == 0) { memset(&fastCoverParams, 0, sizeof(fastCoverParams)); }
+                      else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
+                      else if (!parseFastCoverParameters(argument, &fastCoverParams)) { CLEAN_RETURN(badusage(programName)); }
+                      continue;
+                    }
                     if (longCommandWArg(&argument, "--train-legacy")) {
                       operation = zom_train;
                       if (outFileName == NULL)
                           outFileName = g_defaultDictName;
-                      cover = 0;
+                      dict = legacy;
                       /* Allow optional arguments following an = */
                       if (*argument == 0) { continue; }
                       else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
@@ -849,13 +907,13 @@ int main(int argCount, const char* argv[])
         if (cLevelLast > ZSTD_maxCLevel()) cLevelLast = ZSTD_maxCLevel();
         if (cLevelLast < cLevel) cLevelLast = cLevel;
         if (cLevelLast > cLevel)
-            DISPLAYLEVEL(2, "Benchmarking levels from %d to %d\n", cLevel, cLevelLast);
+            DISPLAYLEVEL(3, "Benchmarking levels from %d to %d\n", cLevel, cLevelLast);
         if(filenameIdx) {
             if(separateFiles) {
                 unsigned i;
                 for(i = 0; i < filenameIdx; i++) {
                     int c;
-                    DISPLAYLEVEL(2, "Benchmarking %s \n", filenameTable[i]);
+                    DISPLAYLEVEL(3, "Benchmarking %s \n", filenameTable[i]);
                     for(c = cLevel; c <= cLevelLast; c++) {
                         BMK_benchFilesAdvanced(&filenameTable[i], 1, dictFileName, c, &compressionParams, g_displayLevel, &benchParams);
                     }
@@ -884,17 +942,22 @@ int main(int argCount, const char* argv[])
         zParams.compressionLevel = dictCLevel;
         zParams.notificationLevel = g_displayLevel;
         zParams.dictID = dictID;
-        if (cover) {
+        if (dict == cover) {
             int const optimize = !coverParams.k || !coverParams.d;
             coverParams.nbThreads = nbWorkers;
             coverParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, &coverParams, optimize);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, &coverParams, NULL, optimize);
+        } else if (dict == fastCover) {
+            int const optimize = !fastCoverParams.k || !fastCoverParams.d;
+            fastCoverParams.nbThreads = nbWorkers;
+            fastCoverParams.zParams = zParams;
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, NULL, &fastCoverParams, optimize);
         } else {
             ZDICT_legacy_params_t dictParams;
             memset(&dictParams, 0, sizeof(dictParams));
             dictParams.selectivityLevel = dictSelect;
             dictParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, &dictParams, NULL, 0);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, &dictParams, NULL, NULL, 0);
         }
 #endif
         goto _end;
diff --git a/tests/.gitignore b/tests/.gitignore
index 4911b2d62..da536251d 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -26,6 +26,7 @@ invalidDictionaries
 checkTag
 zcat
 zstdcat
+tm
 
 # Tmp test directory
 zstdtest
diff --git a/tests/Makefile b/tests/Makefile
index 88a5d763c..d0d798fc3 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -200,7 +200,7 @@ zstreamtest-dll : $(ZSTDDIR)/common/xxhash.c  # xxh symbols not exposed from dll
 zstreamtest-dll : $(ZSTREAM_LOCAL_FILES)
 	$(CC) $(CPPFLAGS) $(CFLAGS) $(filter %.c,$^) $(LDFLAGS) -o $@$(EXT)
 
-paramgrill : DEBUGFLAGS = -DNDEBUG  # turn off assert() for speed measurements
+paramgrill : DEBUGFLAGS =  # turn off assert() by default for speed measurements
 paramgrill : $(ZSTD_FILES) $(PRGDIR)/bench.c $(PRGDIR)/datagen.c paramgrill.c
 	$(CC) $(FLAGS) $^ -lm -o $@$(EXT)
 
diff --git a/tests/decodecorpus.c b/tests/decodecorpus.c
index 936d307f4..2c2276004 100644
--- a/tests/decodecorpus.c
+++ b/tests/decodecorpus.c
@@ -620,6 +620,8 @@ static size_t writeLiteralsBlock(U32* seed, frame_t* frame, size_t contentSize)
 }
 
 static inline void initSeqStore(seqStore_t *seqStore) {
+    seqStore->maxNbSeq = MAX_NB_SEQ;
+    seqStore->maxNbLit = ZSTD_BLOCKSIZE_MAX;
     seqStore->sequencesStart = SEQUENCE_BUFFER;
     seqStore->litStart = SEQUENCE_LITERAL_BUFFER;
     seqStore->llCode = SEQUENCE_LLCODE;
diff --git a/tests/fullbench.c b/tests/fullbench.c
index 12c1e1ae4..fd4815c91 100644
--- a/tests/fullbench.c
+++ b/tests/fullbench.c
@@ -51,6 +51,8 @@
 #define COMPRESSIBILITY_DEFAULT 0.50
 static const size_t g_sampleSize = 10000000;
 
+#define TIMELOOP_NANOSEC      (1*1000000000ULL) /* 1 second */
+
 
 /*_************************************
 *  Macros
@@ -92,63 +94,30 @@ static size_t BMK_findMaxMem(U64 requiredMem)
     return (size_t) requiredMem;
 }
 
-/*_*******************************************************
-*  Argument Parsing
-*********************************************************/
-
-#define ERROR_OUT(msg) { DISPLAY("%s \n", msg); exit(1); }
-
- static unsigned readU32FromChar(const char** stringPtr)
-{
-    const char errorMsg[] = "error: numeric value too large";
-    unsigned result = 0;
-    while ((**stringPtr >='0') && (**stringPtr <='9')) {
-        unsigned const max = (((unsigned)(-1)) / 10) - 1;
-        if (result > max) ERROR_OUT(errorMsg);
-        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
-    }
-    if ((**stringPtr=='K') || (**stringPtr=='M')) {
-        unsigned const maxK = ((unsigned)(-1)) >> 10;
-        if (result > maxK) ERROR_OUT(errorMsg);
-        result <<= 10;
-        if (**stringPtr=='M') {
-            if (result > maxK) ERROR_OUT(errorMsg);
-            result <<= 10;
-        }
-        (*stringPtr)++;  /* skip `K` or `M` */
-        if (**stringPtr=='i') (*stringPtr)++;
-        if (**stringPtr=='B') (*stringPtr)++;
-    }
-    return result;
-}
-
-static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
-{
-    size_t const comSize = strlen(longCommand);
-    int const result = !strncmp(*stringPtr, longCommand, comSize);
-    if (result) *stringPtr += comSize;
-    return result;
-}
 
 /*_*******************************************************
 *  Benchmark wrappers
 *********************************************************/
 
-
 static ZSTD_CCtx* g_zcc = NULL;
 
-size_t local_ZSTD_compress(const void* src, size_t srcSize, void* dst, size_t dstSize, void* buff2)
+static size_t
+local_ZSTD_compress(const void* src, size_t srcSize,
+                    void* dst, size_t dstSize,
+                    void* buff2)
 {
     ZSTD_parameters p;
-    ZSTD_frameParameters f = {1 /* contentSizeHeader*/, 0, 0};
+    ZSTD_frameParameters f = { 1 /* contentSizeHeader*/, 0, 0 };
     p.fParams = f;
     p.cParams = *(ZSTD_compressionParameters*)buff2;
-    return ZSTD_compress_advanced (g_zcc,dst, dstSize, src, srcSize, NULL ,0, p);
+    return ZSTD_compress_advanced (g_zcc, dst, dstSize, src, srcSize, NULL ,0, p);
     //return ZSTD_compress(dst, dstSize, src, srcSize, cLevel);
 }
 
 static size_t g_cSize = 0;
-size_t local_ZSTD_decompress(const void* src, size_t srcSize, void* dst, size_t dstSize, void* buff2)
+static size_t local_ZSTD_decompress(const void* src, size_t srcSize,
+                                    void* dst, size_t dstSize,
+                                    void* buff2)
 {
     (void)src; (void)srcSize;
     return ZSTD_decompress(dst, dstSize, buff2, g_cSize);
@@ -174,7 +143,10 @@ size_t local_ZSTD_decodeSeqHeaders(const void* src, size_t srcSize, void* dst, s
 #endif
 
 static ZSTD_CStream* g_cstream= NULL;
-size_t local_ZSTD_compressStream(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+static size_t
+local_ZSTD_compressStream(const void* src, size_t srcSize,
+                          void* dst, size_t dstCapacity,
+                          void* buff2)
 {
     ZSTD_outBuffer buffOut;
     ZSTD_inBuffer buffIn;
@@ -194,7 +166,10 @@ size_t local_ZSTD_compressStream(const void* src, size_t srcSize, void* dst, siz
     return buffOut.pos;
 }
 
-static size_t local_ZSTD_compress_generic_end(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+static size_t
+local_ZSTD_compress_generic_end(const void* src, size_t srcSize,
+                                void* dst, size_t dstCapacity,
+                                void* buff2)
 {
     ZSTD_outBuffer buffOut;
     ZSTD_inBuffer buffIn;
@@ -209,7 +184,10 @@ static size_t local_ZSTD_compress_generic_end(const void* src, size_t srcSize, v
     return buffOut.pos;
 }
 
-static size_t local_ZSTD_compress_generic_continue(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+static size_t
+local_ZSTD_compress_generic_continue(const void* src, size_t srcSize,
+                                     void* dst, size_t dstCapacity,
+                                     void* buff2)
 {
     ZSTD_outBuffer buffOut;
     ZSTD_inBuffer buffIn;
@@ -225,7 +203,10 @@ static size_t local_ZSTD_compress_generic_continue(const void* src, size_t srcSi
     return buffOut.pos;
 }
 
-static size_t local_ZSTD_compress_generic_T2_end(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+static size_t
+local_ZSTD_compress_generic_T2_end(const void* src, size_t srcSize,
+                                   void* dst, size_t dstCapacity,
+                                   void* buff2)
 {
     ZSTD_outBuffer buffOut;
     ZSTD_inBuffer buffIn;
@@ -241,7 +222,10 @@ static size_t local_ZSTD_compress_generic_T2_end(const void* src, size_t srcSize
     return buffOut.pos;
 }
 
-static size_t local_ZSTD_compress_generic_T2_continue(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+static size_t
+local_ZSTD_compress_generic_T2_continue(const void* src, size_t srcSize,
+                                        void* dst, size_t dstCapacity,
+                                        void* buff2)
 {
     ZSTD_outBuffer buffOut;
     ZSTD_inBuffer buffIn;
@@ -259,7 +243,10 @@ static size_t local_ZSTD_compress_generic_T2_continue(const void* src, size_t sr
 }
 
 static ZSTD_DStream* g_dstream= NULL;
-static size_t local_ZSTD_decompressStream(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+static size_t
+local_ZSTD_decompressStream(const void* src, size_t srcSize,
+                            void* dst, size_t dstCapacity,
+                            void* buff2)
 {
     ZSTD_outBuffer buffOut;
     ZSTD_inBuffer buffIn;
@@ -276,10 +263,12 @@ static size_t local_ZSTD_decompressStream(const void* src, size_t srcSize, void*
 }
 
 #ifndef ZSTD_DLL_IMPORT
-size_t local_ZSTD_compressContinue(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+size_t local_ZSTD_compressContinue(const void* src, size_t srcSize,
+                                   void* dst, size_t dstCapacity,
+                                   void* buff2)
 {
     ZSTD_parameters p;
-    ZSTD_frameParameters f = {1 /* contentSizeHeader*/, 0, 0};
+    ZSTD_frameParameters f = { 1 /* contentSizeHeader*/, 0, 0 };
     p.fParams = f;
     p.cParams = *(ZSTD_compressionParameters*)buff2;
     ZSTD_compressBegin_advanced(g_zcc, NULL, 0, p, srcSize);
@@ -287,26 +276,38 @@ size_t local_ZSTD_compressContinue(const void* src, size_t srcSize, void* dst, s
 }
 
 #define FIRST_BLOCK_SIZE 8
-size_t local_ZSTD_compressContinue_extDict(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+size_t local_ZSTD_compressContinue_extDict(const void* src, size_t srcSize,
+                                           void* dst, size_t dstCapacity,
+                                           void* buff2)
 {
     BYTE firstBlockBuf[FIRST_BLOCK_SIZE];
-    
+
     ZSTD_parameters p;
-    ZSTD_frameParameters f = {1 , 0, 0};
+    ZSTD_frameParameters f = { 1, 0, 0 };
     p.fParams = f;
     p.cParams = *(ZSTD_compressionParameters*)buff2;
     ZSTD_compressBegin_advanced(g_zcc, NULL, 0, p, srcSize);
     memcpy(firstBlockBuf, src, FIRST_BLOCK_SIZE);
 
-    {   size_t const compressResult = ZSTD_compressContinue(g_zcc, dst, dstCapacity, firstBlockBuf, FIRST_BLOCK_SIZE);
-        if (ZSTD_isError(compressResult)) { DISPLAY("local_ZSTD_compressContinue_extDict error : %s\n", ZSTD_getErrorName(compressResult)); return compressResult; }
+    {   size_t const compressResult = ZSTD_compressContinue(g_zcc,
+                                            dst, dstCapacity,
+                                            firstBlockBuf, FIRST_BLOCK_SIZE);
+        if (ZSTD_isError(compressResult)) {
+            DISPLAY("local_ZSTD_compressContinue_extDict error : %s\n",
+                    ZSTD_getErrorName(compressResult));
+            return compressResult;
+        }
         dst = (BYTE*)dst + compressResult;
         dstCapacity -= compressResult;
     }
-    return ZSTD_compressEnd(g_zcc, dst, dstCapacity, (const BYTE*)src + FIRST_BLOCK_SIZE, srcSize - FIRST_BLOCK_SIZE);
+    return ZSTD_compressEnd(g_zcc, dst, dstCapacity,
+                            (const BYTE*)src + FIRST_BLOCK_SIZE,
+                            srcSize - FIRST_BLOCK_SIZE);
 }
 
-size_t local_ZSTD_decompressContinue(const void* src, size_t srcSize, void* dst, size_t dstCapacity, void* buff2)
+size_t local_ZSTD_decompressContinue(const void* src, size_t srcSize,
+                                           void* dst, size_t dstCapacity,
+                                           void* buff2)
 {
     size_t regeneratedSize = 0;
     const BYTE* ip = (const BYTE*)buff2;
@@ -314,7 +315,7 @@ size_t local_ZSTD_decompressContinue(const void* src, size_t srcSize, void* dst,
     BYTE* op = (BYTE*)dst;
     size_t remainingCapacity = dstCapacity;
 
-    (void)src; (void)srcSize;
+    (void)src; (void)srcSize;  /* unused */
     ZSTD_decompressBegin(g_zdc);
     while (ip < iend) {
         size_t const iSize = ZSTD_nextSrcSizeToDecompress(g_zdc);
@@ -333,14 +334,16 @@ size_t local_ZSTD_decompressContinue(const void* src, size_t srcSize, void* dst,
 /*_*******************************************************
 *  Bench functions
 *********************************************************/
-static size_t benchMem(const void* src, size_t srcSize, U32 benchNb, int cLevel, ZSTD_compressionParameters* cparams)
+static size_t benchMem(U32 benchNb,
+                       const void* src, size_t srcSize,
+                       int cLevel, ZSTD_compressionParameters cparams)
 {
-    BYTE*  dstBuff;
     size_t dstBuffSize = ZSTD_compressBound(srcSize);
-    void*  buff2, *buff1;
+    BYTE*  dstBuff;
+    void*  dstBuff2;
+    void*  buff2;
     const char* benchName;
     BMK_benchFn_t benchFunction;
-    BMK_customReturn_t r;
     int errorcode = 0;
 
     /* Selection */
@@ -393,56 +396,56 @@ static size_t benchMem(const void* src, size_t srcSize, U32 benchNb, int cLevel,
 
     /* Allocation */
     dstBuff = (BYTE*)malloc(dstBuffSize);
-    buff2 = malloc(dstBuffSize);
-    if ((!dstBuff) || (!buff2)) {
+    dstBuff2 = malloc(dstBuffSize);
+    if ((!dstBuff) || (!dstBuff2)) {
         DISPLAY("\nError: not enough memory!\n");
-        free(dstBuff); free(buff2);
+        free(dstBuff); free(dstBuff2);
         return 12;
     }
-    buff1 = buff2;
+    buff2 = dstBuff2;
     if (g_zcc==NULL) g_zcc = ZSTD_createCCtx();
     if (g_zdc==NULL) g_zdc = ZSTD_createDCtx();
     if (g_cstream==NULL) g_cstream = ZSTD_createCStream();
     if (g_dstream==NULL) g_dstream = ZSTD_createDStream();
 
-    /* DISPLAY("params: cLevel %d, wlog %d hlog %d clog %d slog %d slen %d tlen %d strat %d \n"
-        , cLevel, cparams->windowLog, cparams->hashLog, cparams->chainLog, cparams->searchLog, 
-          cparams->searchLength, cparams->targetLength, cparams->strategy);*/
+    /* DISPLAY("params: cLevel %d, wlog %d hlog %d clog %d slog %d slen %d tlen %d strat %d \n",
+          cLevel, cparams->windowLog, cparams->hashLog, cparams->chainLog, cparams->searchLog,
+          cparams->searchLength, cparams->targetLength, cparams->strategy); */
 
     ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_compressionLevel, cLevel);
-    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_windowLog, cparams->windowLog);
-    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_hashLog, cparams->hashLog);
-    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_chainLog, cparams->chainLog);
-    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_searchLog, cparams->searchLog);
-    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_minMatch, cparams->searchLength);
-    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_targetLength, cparams->targetLength);
-    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_compressionStrategy, cparams->strategy);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_windowLog, cparams.windowLog);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_hashLog, cparams.hashLog);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_chainLog, cparams.chainLog);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_searchLog, cparams.searchLog);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_minMatch, cparams.searchLength);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_targetLength, cparams.targetLength);
+    ZSTD_CCtx_setParameter(g_zcc, ZSTD_p_compressionStrategy, cparams.strategy);
 
 
     ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_compressionLevel, cLevel);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_windowLog, cparams->windowLog);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_hashLog, cparams->hashLog);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_chainLog, cparams->chainLog);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_searchLog, cparams->searchLog);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_minMatch, cparams->searchLength);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_targetLength, cparams->targetLength);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_compressionStrategy, cparams->strategy);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_windowLog, cparams.windowLog);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_hashLog, cparams.hashLog);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_chainLog, cparams.chainLog);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_searchLog, cparams.searchLog);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_minMatch, cparams.searchLength);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_targetLength, cparams.targetLength);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_compressionStrategy, cparams.strategy);
 
     /* Preparation */
     switch(benchNb)
     {
     case 1:
-        buff2 = (void*)cparams;
+        buff2 = &cparams;
         break;
     case 2:
         g_cSize = ZSTD_compress(buff2, dstBuffSize, src, srcSize, cLevel);
         break;
 #ifndef ZSTD_DLL_IMPORT
     case 11:
-        buff2 = (void*)cparams;
+        buff2 = &cparams;
         break;
     case 12:
-        buff2 = (void*)cparams;
+        buff2 = &cparams;
         break;
     case 13 :
         g_cSize = ZSTD_compress(buff2, dstBuffSize, src, srcSize, cLevel);
@@ -494,8 +497,8 @@ static size_t benchMem(const void* src, size_t srcSize, U32 benchNb, int cLevel,
     case 31:
         goto _cleanOut;
 #endif
-    case 41 : 
-        buff2 = (void*)cparams;
+    case 41 :
+        buff2 = &cparams;
         break;
     case 42 :
         g_cSize = ZSTD_compress(buff2, dstBuffSize, src, srcSize, cLevel);
@@ -507,29 +510,50 @@ static size_t benchMem(const void* src, size_t srcSize, U32 benchNb, int cLevel,
     default : ;
     }
 
-
-     /* warming up memory */
+     /* warming up dstBuff */
     { size_t i; for (i=0; i<dstBuffSize; i++) dstBuff[i]=(BYTE)i; }
 
-
     /* benchmark loop */
-    {
-        void* dstBuffv = (void*)dstBuff;
-        r = BMK_benchFunction(benchFunction, buff2, 
-            NULL, NULL,  1, &src, &srcSize, 
-            &dstBuffv, &dstBuffSize, NULL, g_nbIterations);
-        if(r.error) {
-            DISPLAY("ERROR %d ! ! \n", r.error);
-            errorcode = r.error;
-            goto _cleanOut;
-        }
+    {   BMK_timedFnState_t* const tfs = BMK_createTimedFnState(g_nbIterations * 1000, 1000);
+        BMK_runTime_t bestResult;
+        bestResult.sumOfReturn = 0;
+        bestResult.nanoSecPerRun = (unsigned long long)(-1LL);
+        assert(tfs != NULL);
+        for (;;) {
+            void* const dstBuffv = dstBuff;
+            BMK_runOutcome_t const bOutcome =
+                    BMK_benchTimedFn( tfs,
+                            benchFunction, buff2,
+                            NULL, NULL,   /* initFn */
+                            1,  /* blockCount */
+                            &src, &srcSize,
+                            &dstBuffv, &dstBuffSize,
+                            NULL);
 
-        DISPLAY("%2u#Speed: %f MB/s - Size: %f MB - %s\n", benchNb, (double)srcSize / r.result.nanoSecPerRun * 1000, (double)r.result.sumOfReturn / 1000000, benchName);
+            if (!BMK_isSuccessful_runOutcome(bOutcome)) {
+                DISPLAY("ERROR benchmarking function ! ! \n");
+                errorcode = 1;
+                goto _cleanOut;
+            }
+
+            {   BMK_runTime_t const newResult = BMK_extract_runTime(bOutcome);
+                if (newResult.nanoSecPerRun < bestResult.nanoSecPerRun )
+                    bestResult.nanoSecPerRun = newResult.nanoSecPerRun;
+                DISPLAY("\r%2u#%-29.29s:%8.1f MB/s  (%8u) ",
+                        benchNb, benchName,
+                        (double)srcSize * TIMELOOP_NANOSEC / bestResult.nanoSecPerRun / MB_UNIT,
+                        (unsigned)newResult.sumOfReturn );
+            }
+
+            if ( BMK_isCompleted_TimedFn(tfs) ) break;
+        }
+        BMK_freeTimedFnState(tfs);
     }
-    
+    DISPLAY("\n");
+
 _cleanOut:
-    free(buff1);
     free(dstBuff);
+    free(dstBuff2);
     ZSTD_freeCCtx(g_zcc); g_zcc=NULL;
     ZSTD_freeDCtx(g_zdc); g_zdc=NULL;
     ZSTD_freeCStream(g_cstream); g_cstream=NULL;
@@ -538,87 +562,138 @@ _cleanOut:
 }
 
 
-static int benchSample(U32 benchNb, int cLevel, ZSTD_compressionParameters* cparams)
+static int benchSample(U32 benchNb,
+                       int cLevel, ZSTD_compressionParameters cparams)
 {
     size_t const benchedSize = g_sampleSize;
-    const char* name = "Sample 10MiB";
+    const char* const name = "Sample 10MiB";
 
     /* Allocation */
-    void* origBuff = malloc(benchedSize);
+    void* const origBuff = malloc(benchedSize);
     if (!origBuff) { DISPLAY("\nError: not enough memory!\n"); return 12; }
 
     /* Fill buffer */
     RDG_genBuffer(origBuff, benchedSize, g_compressibility, 0.0, 0);
 
     /* bench */
-    DISPLAY("\r%79s\r", "");
+    DISPLAY("\r%70s\r", "");
     DISPLAY(" %s : \n", name);
-    if (benchNb)
-        benchMem(origBuff, benchedSize, benchNb, cLevel, cparams);
-    else
-        for (benchNb=0; benchNb<100; benchNb++) benchMem(origBuff, benchedSize, benchNb, cLevel, cparams);
+    if (benchNb) {
+        benchMem(benchNb, origBuff, benchedSize, cLevel, cparams);
+    } else {  /* 0 == run all tests */
+        for (benchNb=0; benchNb<100; benchNb++) {
+            benchMem(benchNb, origBuff, benchedSize, cLevel, cparams);
+    }   }
 
     free(origBuff);
     return 0;
 }
 
 
-static int benchFiles(const char** fileNamesTable, const int nbFiles, U32 benchNb, int cLevel, ZSTD_compressionParameters* cparams)
+static int benchFiles(U32 benchNb,
+                      const char** fileNamesTable, const int nbFiles,
+                      int cLevel, ZSTD_compressionParameters cparams)
 {
     /* Loop for each file */
     int fileIdx;
     for (fileIdx=0; fileIdx<nbFiles; fileIdx++) {
         const char* const inFileName = fileNamesTable[fileIdx];
         FILE* const inFile = fopen( inFileName, "rb" );
-        U64   inFileSize;
         size_t benchedSize;
-        void* origBuff;
 
         /* Check file existence */
         if (inFile==NULL) { DISPLAY( "Pb opening %s\n", inFileName); return 11; }
 
         /* Memory allocation & restrictions */
-        inFileSize = UTIL_getFileSize(inFileName);
-        if (inFileSize == UTIL_FILESIZE_UNKNOWN) {
-            DISPLAY( "Cannot measure size of %s\n", inFileName);
-            fclose(inFile);
-            return 11;
-        }
-        benchedSize = BMK_findMaxMem(inFileSize*3) / 3;
-        if ((U64)benchedSize > inFileSize) benchedSize = (size_t)inFileSize;
-        if (benchedSize < inFileSize)
-            DISPLAY("Not enough memory for '%s' full size; testing %u MB only...\n", inFileName, (U32)(benchedSize>>20));
-
-        /* Alloc */
-        origBuff = malloc(benchedSize);
-        if (!origBuff) { DISPLAY("\nError: not enough memory!\n"); fclose(inFile); return 12; }
-
-        /* Fill input buffer */
-        DISPLAY("Loading %s...       \r", inFileName);
-        {
-            size_t readSize = fread(origBuff, 1, benchedSize, inFile);
-            fclose(inFile);
-            if (readSize != benchedSize) {
-                DISPLAY("\nError: problem reading file '%s' !!    \n", inFileName);
-                free(origBuff);
-                return 13;
+        {   U64 const inFileSize = UTIL_getFileSize(inFileName);
+            if (inFileSize == UTIL_FILESIZE_UNKNOWN) {
+                DISPLAY( "Cannot measure size of %s\n", inFileName);
+                fclose(inFile);
+                return 11;
+            }
+            benchedSize = BMK_findMaxMem(inFileSize*3) / 3;
+            if ((U64)benchedSize > inFileSize)
+                benchedSize = (size_t)inFileSize;
+            if ((U64)benchedSize < inFileSize) {
+                DISPLAY("Not enough memory for '%s' full size; testing %u MB only... \n",
+                        inFileName, (U32)(benchedSize>>20));
         }   }
 
-        /* bench */
-        DISPLAY("\r%79s\r", "");
-        DISPLAY(" %s : \n", inFileName);
-        if (benchNb)
-            benchMem(origBuff, benchedSize, benchNb, cLevel, cparams);
-        else
-            for (benchNb=0; benchNb<100; benchNb++) benchMem(origBuff, benchedSize, benchNb, cLevel, cparams);
+        /* Alloc */
+        {   void* const origBuff = malloc(benchedSize);
+            if (!origBuff) { DISPLAY("\nError: not enough memory!\n"); fclose(inFile); return 12; }
 
-        free(origBuff);
-    }
+            /* Fill input buffer */
+            DISPLAY("Loading %s...       \r", inFileName);
+            {   size_t const readSize = fread(origBuff, 1, benchedSize, inFile);
+                fclose(inFile);
+                if (readSize != benchedSize) {
+                    DISPLAY("\nError: problem reading file '%s' !!    \n", inFileName);
+                    free(origBuff);
+                    return 13;
+            }   }
+
+            /* bench */
+            DISPLAY("\r%70s\r", "");   /* blank line */
+            DISPLAY(" %s : \n", inFileName);
+            if (benchNb) {
+                benchMem(benchNb, origBuff, benchedSize, cLevel, cparams);
+            } else {
+                for (benchNb=0; benchNb<100; benchNb++) {
+                    benchMem(benchNb, origBuff, benchedSize, cLevel, cparams);
+            }   }
+
+            free(origBuff);
+    }   }
 
     return 0;
 }
 
 
+
+/*_*******************************************************
+*  Argument Parsing
+*********************************************************/
+
+#define ERROR_OUT(msg) { DISPLAY("%s \n", msg); exit(1); }
+
+static unsigned readU32FromChar(const char** stringPtr)
+{
+    const char errorMsg[] = "error: numeric value too large";
+    unsigned result = 0;
+    while ((**stringPtr >='0') && (**stringPtr <='9')) {
+        unsigned const max = (((unsigned)(-1)) / 10) - 1;
+        if (result > max) ERROR_OUT(errorMsg);
+        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
+    }
+    if ((**stringPtr=='K') || (**stringPtr=='M')) {
+        unsigned const maxK = ((unsigned)(-1)) >> 10;
+        if (result > maxK) ERROR_OUT(errorMsg);
+        result <<= 10;
+        if (**stringPtr=='M') {
+            if (result > maxK) ERROR_OUT(errorMsg);
+            result <<= 10;
+        }
+        (*stringPtr)++;  /* skip `K` or `M` */
+        if (**stringPtr=='i') (*stringPtr)++;
+        if (**stringPtr=='B') (*stringPtr)++;
+    }
+    return result;
+}
+
+static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
+{
+    size_t const comSize = strlen(longCommand);
+    int const result = !strncmp(*stringPtr, longCommand, comSize);
+    if (result) *stringPtr += comSize;
+    return result;
+}
+
+
+/*_*******************************************************
+*  Command line
+*********************************************************/
+
 static int usage(const char* exename)
 {
     DISPLAY( "Usage :\n");
@@ -649,8 +724,8 @@ static int badusage(const char* exename)
 
 int main(int argc, const char** argv)
 {
-    int i, filenamesStart=0, result;
-    const char* exename = argv[0];
+    int argNb, filenamesStart=0, result;
+    const char* const exename = argv[0];
     const char* input_filename = NULL;
     U32 benchNb = 0, main_pause = 0;
     int cLevel = DEFAULT_CLEVEL;
@@ -659,8 +734,8 @@ int main(int argc, const char** argv)
     DISPLAY(WELCOME_MESSAGE);
     if (argc<1) return badusage(exename);
 
-    for(i=1; i<argc; i++) {
-        const char* argument = argv[i];
+    for (argNb=1; argNb<argc; argNb++) {
+        const char* argument = argv[argNb];
         assert(argument != NULL);
 
         if (longCommandWArg(&argument, "--zstd=")) {
@@ -677,12 +752,14 @@ int main(int argc, const char** argv)
                 return 1;
             }
 
+            /* check end of string */
             if (argument[0] != 0) {
-                DISPLAY("invalid --zstd= format\n");
-                return 1; // check the end of string 
+                DISPLAY("invalid --zstd= format \n");
+                return 1;
             } else {
                 continue;
             }
+
         } else if (argument[0]=='-') { /* Commands (note : aggregated commands are allowed) */
             argument++;
             while (argument[0]!=0) {
@@ -698,35 +775,27 @@ int main(int argc, const char** argv)
 
                     /* Select specific algorithm to bench */
                 case 'b':
-                    {
-                        argument++;
-                        benchNb = readU32FromChar(&argument);
-                        break;
-                    }
+                    argument++;
+                    benchNb = readU32FromChar(&argument);
+                    break;
 
                     /* Modify Nb Iterations */
                 case 'i':
-                    {
-                        argument++;
-                        BMK_SetNbIterations((int)readU32FromChar(&argument));
-                    }
+                    argument++;
+                    BMK_SetNbIterations((int)readU32FromChar(&argument));
                     break;
 
                     /* Select compressibility of synthetic sample */
                 case 'P':
-                    {   argument++;
-                        g_compressibility = (double)readU32FromChar(&argument) / 100.;
-                    }
+                    argument++;
+                    g_compressibility = (double)readU32FromChar(&argument) / 100.;
                     break;
                 case 'l':
-                    {   argument++;
-                        cLevel = readU32FromChar(&argument);
-                        cparams = ZSTD_getCParams(cLevel, 0, 0);
-                    }
+                    argument++;
+                    cLevel = readU32FromChar(&argument);
+                    cparams = ZSTD_getCParams(cLevel, 0, 0);
                     break;
 
-
-
                     /* Unknown command */
                 default : return badusage(exename);
                 }
@@ -735,15 +804,15 @@ int main(int argc, const char** argv)
         }
 
         /* first provided filename is input */
-        if (!input_filename) { input_filename=argument; filenamesStart=i; continue; }
+        if (!input_filename) { input_filename=argument; filenamesStart=argNb; continue; }
     }
 
 
 
     if (filenamesStart==0)   /* no input file */
-        result = benchSample(benchNb, cLevel, &cparams);
+        result = benchSample(benchNb, cLevel, cparams);
     else
-        result = benchFiles(argv+filenamesStart, argc-filenamesStart, benchNb, cLevel, &cparams);
+        result = benchFiles(benchNb, argv+filenamesStart, argc-filenamesStart, cLevel, cparams);
 
     if (main_pause) { int unused; printf("press enter...\n"); unused = getchar(); (void)unused; }
 
diff --git a/tests/fuzzer.c b/tests/fuzzer.c
index 6d57afa16..641357e76 100644
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -179,13 +179,9 @@ static void FUZ_displayMallocStats(mallocCounter_t count)
         (U32)(count.totalMalloc >> 10));
 }
 
-static int FUZ_mallocTests(unsigned seed, double compressibility, unsigned part)
+static int FUZ_mallocTests_internal(unsigned seed, double compressibility, unsigned part,
+                void* inBuffer, size_t inSize, void* outBuffer, size_t outSize)
 {
-    size_t const inSize = 64 MB + 16 MB + 4 MB + 1 MB + 256 KB + 64 KB; /* 85.3 MB */
-    size_t const outSize = ZSTD_compressBound(inSize);
-    void* const inBuffer = malloc(inSize);
-    void* const outBuffer = malloc(outSize);
-
     /* test only played in verbose mode, as they are long */
     if (g_displayLevel<3) return 0;
 
@@ -270,6 +266,28 @@ static int FUZ_mallocTests(unsigned seed, double compressibility, unsigned part)
     return 0;
 }
 
+static int FUZ_mallocTests(unsigned seed, double compressibility, unsigned part)
+{
+    size_t const inSize = 64 MB + 16 MB + 4 MB + 1 MB + 256 KB + 64 KB; /* 85.3 MB */
+    size_t const outSize = ZSTD_compressBound(inSize);
+    void* const inBuffer = malloc(inSize);
+    void* const outBuffer = malloc(outSize);
+    int result;
+
+    /* Create compressible noise */
+    if (!inBuffer || !outBuffer) {
+        DISPLAY("Not enough memory, aborting \n");
+        exit(1);
+    }
+
+    result = FUZ_mallocTests_internal(seed, compressibility, part,
+                    inBuffer, inSize, outBuffer, outSize);
+
+    free(inBuffer);
+    free(outBuffer);
+    return result;
+}
+
 #else
 
 static int FUZ_mallocTests(unsigned seed, double compressibility, unsigned part)
@@ -1357,6 +1375,24 @@ static int basicUnitTests(U32 seed, double compressibility)
                 ((BYTE*)CNBuffer)[i+1] = _3BytesSeqs[id][1];
                 ((BYTE*)CNBuffer)[i+2] = _3BytesSeqs[id][2];
     }   }   }
+    DISPLAYLEVEL(3, "test%3i : growing nbSeq : ", testNb++);
+    {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+        size_t const maxNbSeq = _3BYTESTESTLENGTH / 3;
+        size_t const bound = ZSTD_compressBound(_3BYTESTESTLENGTH);
+        size_t nbSeq = 1;
+        while (nbSeq <= maxNbSeq) {
+          CHECK(ZSTD_compressCCtx(cctx, compressedBuffer, bound, CNBuffer, nbSeq * 3, 19));
+          /* Check every sequence for the first 100, then skip more rapidly. */
+          if (nbSeq < 100) {
+            ++nbSeq;
+          } else {
+            nbSeq += (nbSeq >> 2);
+          }
+        }
+        ZSTD_freeCCtx(cctx);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
     DISPLAYLEVEL(3, "test%3i : compress lots 3-bytes sequences : ", testNb++);
     { CHECK_V(r, ZSTD_compress(compressedBuffer, ZSTD_compressBound(_3BYTESTESTLENGTH),
                                  CNBuffer, _3BYTESTESTLENGTH, 19) );
@@ -1368,8 +1404,26 @@ static int basicUnitTests(U32 seed, double compressibility)
       if (r != _3BYTESTESTLENGTH) goto _output_error; }
     DISPLAYLEVEL(3, "OK \n");
 
-    DISPLAYLEVEL(3, "test%3i : incompressible data and ill suited dictionary : ", testNb++);
+
+    DISPLAYLEVEL(3, "test%3i : growing literals buffer : ", testNb++);
     RDG_genBuffer(CNBuffer, CNBuffSize, 0.0, 0.1, seed);
+    {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+        size_t const bound = ZSTD_compressBound(CNBuffSize);
+        size_t size = 1;
+        while (size <= CNBuffSize) {
+          CHECK(ZSTD_compressCCtx(cctx, compressedBuffer, bound, CNBuffer, size, 3));
+          /* Check every size for the first 100, then skip more rapidly. */
+          if (size < 100) {
+            ++size;
+          } else {
+            size += (size >> 2);
+          }
+        }
+        ZSTD_freeCCtx(cctx);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
+    DISPLAYLEVEL(3, "test%3i : incompressible data and ill suited dictionary : ", testNb++);
     {   /* Train a dictionary on low characters */
         size_t dictSize = 16 KB;
         void* const dictBuffer = malloc(dictSize);
@@ -1535,7 +1589,6 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD
     size_t const dstBufferSize = (size_t)1<<maxSampleLog;
     size_t const cBufferSize   = ZSTD_compressBound(dstBufferSize);
     BYTE* cNoiseBuffer[5];
-    BYTE* srcBuffer;   /* jumping pointer */
     BYTE* const cBuffer = (BYTE*) malloc (cBufferSize);
     BYTE* const dstBuffer = (BYTE*) malloc (dstBufferSize);
     BYTE* const mirrorBuffer = (BYTE*) malloc (dstBufferSize);
@@ -1544,7 +1597,7 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD
     ZSTD_DCtx* const dctx = ZSTD_createDCtx();
     U32 result = 0;
     U32 testNb = 0;
-    U32 coreSeed = seed, lseed = 0;
+    U32 coreSeed = seed;
     UTIL_time_t const startClock = UTIL_getTime();
     U64 const maxClockSpan = maxDurationS * SEC_TO_MICRO;
     int const cLevelLimiter = bigTests ? 3 : 2;
@@ -1565,13 +1618,14 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD
     RDG_genBuffer(cNoiseBuffer[2], srcBufferSize, compressibility, 0., coreSeed);
     RDG_genBuffer(cNoiseBuffer[3], srcBufferSize, 0.95, 0., coreSeed);    /* highly compressible */
     RDG_genBuffer(cNoiseBuffer[4], srcBufferSize, 1.00, 0., coreSeed);    /* sparse content */
-    srcBuffer = cNoiseBuffer[2];
 
     /* catch up testNb */
     for (testNb=1; testNb < startTest; testNb++) FUZ_rand(&coreSeed);
 
     /* main test loop */
     for ( ; (testNb <= nbTests) || (UTIL_clockSpanMicro(startClock) < maxClockSpan); testNb++ ) {
+        BYTE* srcBuffer;   /* jumping pointer */
+        U32 lseed;
         size_t sampleSize, maxTestSize, totalTestSize;
         size_t cSize, totalCSize, totalGenSize;
         U64 crcOrig;
@@ -1802,11 +1856,9 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD
         CHECK (totalGenSize != totalTestSize, "streaming decompressed data : wrong size")
         CHECK (totalCSize != cSize, "compressed data should be fully read")
         {   U64 const crcDest = XXH64(dstBuffer, totalTestSize, 0);
-            if (crcDest!=crcOrig) {
-                size_t const errorPos = findDiff(mirrorBuffer, dstBuffer, totalTestSize);
-                CHECK (1, "streaming decompressed data corrupted : byte %u / %u  (%02X!=%02X)",
-                   (U32)errorPos, (U32)totalTestSize, dstBuffer[errorPos], mirrorBuffer[errorPos]);
-        }   }
+            CHECK(crcOrig != crcDest, "streaming decompressed data corrupted (pos %u / %u)",
+                (U32)findDiff(mirrorBuffer, dstBuffer, totalTestSize), (U32)totalTestSize);
+        }
     }   /* for ( ; (testNb <= nbTests) */
     DISPLAY("\r%u fuzzer tests completed   \n", testNb-1);
 
diff --git a/tests/paramgrill.c b/tests/paramgrill.c
index f42afe403..5275f10a5 100644
--- a/tests/paramgrill.c
+++ b/tests/paramgrill.c
@@ -27,7 +27,8 @@
 #include "util.h"
 #include "bench.h"
 #include "zstd_errors.h"
-#include "zstd_internal.h"
+#include "zstd_internal.h"     /* should not be needed */
+
 
 /*-************************************
 *  Constants
@@ -46,6 +47,7 @@ static const size_t maxMemory = (sizeof(size_t)==4)  ?  (2 GB - 64 MB) : (size_t
 static const U64 g_maxVariationTime = 60 * SEC_TO_MICRO;
 static const int g_maxNbVariations = 64;
 
+
 /*-************************************
 *  Macros
 **************************************/
@@ -68,7 +70,7 @@ static const int g_maxNbVariations = 64;
 #define FADT_MIN 0
 #define FADT_MAX ((U32)-1)
 
-#define ZSTD_TARGETLENGTH_MIN 0 
+#define ZSTD_TARGETLENGTH_MIN 0
 #define ZSTD_TARGETLENGTH_MAX 999
 
 #define WLOG_RANGE (ZSTD_WINDOWLOG_MAX - ZSTD_WINDOWLOG_MIN + 1)
@@ -90,9 +92,9 @@ static const char* g_stratName[ZSTD_btultra+1] = {
                 "ZSTD_greedy  ", "ZSTD_lazy    ", "ZSTD_lazy2   ",
                 "ZSTD_btlazy2 ", "ZSTD_btopt   ", "ZSTD_btultra "};
 
-
 static const U32 tlen_table[TLEN_RANGE] = { 0, 1, 2, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 256, 512, 999 };
 
+
 /*-************************************
 *  Setup for Adding new params
 **************************************/
@@ -115,27 +117,27 @@ typedef struct {
 } paramValues_t;
 
 /* maximum value of parameters */
-static const U32 mintable[NUM_PARAMS] = 
+static const U32 mintable[NUM_PARAMS] =
         { ZSTD_WINDOWLOG_MIN, ZSTD_CHAINLOG_MIN, ZSTD_HASHLOG_MIN, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLENGTH_MIN, ZSTD_TARGETLENGTH_MIN, ZSTD_fast, FADT_MIN };
 
 /* minimum value of parameters */
-static const U32 maxtable[NUM_PARAMS] = 
+static const U32 maxtable[NUM_PARAMS] =
         { ZSTD_WINDOWLOG_MAX, ZSTD_CHAINLOG_MAX, ZSTD_HASHLOG_MAX, ZSTD_SEARCHLOG_MAX, ZSTD_SEARCHLENGTH_MAX, ZSTD_TARGETLENGTH_MAX, ZSTD_btultra, FADT_MAX };
 
 /* # of values parameters can take on */
-static const U32 rangetable[NUM_PARAMS] = 
+static const U32 rangetable[NUM_PARAMS] =
         { WLOG_RANGE, CLOG_RANGE, HLOG_RANGE, SLOG_RANGE, SLEN_RANGE, TLEN_RANGE, STRT_RANGE, FADT_RANGE };
 
 /* ZSTD_cctxSetParameter() index to set */
-static const ZSTD_cParameter cctxSetParamTable[NUM_PARAMS] = 
+static const ZSTD_cParameter cctxSetParamTable[NUM_PARAMS] =
         { ZSTD_p_windowLog, ZSTD_p_chainLog, ZSTD_p_hashLog, ZSTD_p_searchLog, ZSTD_p_minMatch, ZSTD_p_targetLength, ZSTD_p_compressionStrategy, ZSTD_p_forceAttachDict };
 
 /* names of parameters */
-static const char* g_paramNames[NUM_PARAMS] = 
+static const char* g_paramNames[NUM_PARAMS] =
         { "windowLog", "chainLog", "hashLog","searchLog", "searchLength", "targetLength", "strategy", "forceAttachDict" };
 
 /* shortened names of parameters */
-static const char* g_shortParamNames[NUM_PARAMS] = 
+static const char* g_shortParamNames[NUM_PARAMS] =
         { "wlog", "clog", "hlog","slog", "slen", "tlen", "strt", "fadt" };
 
 /* maps value from { 0 to rangetable[param] - 1 } to valid paramvalues */
@@ -178,7 +180,7 @@ static int invRangeMap(varInds_t param, U32 value) {
                     hi = mid;
                 }
             }
-            return lo;    
+            return lo;
         }
         case fadt_ind:
             return (int)value + 1;
@@ -201,17 +203,18 @@ static void displayParamVal(FILE* f, varInds_t param, U32 value, int width) {
     switch(param) {
         case fadt_ind: if(width) { fprintf(f, "%*d", width, (int)value); } else { fprintf(f, "%d", (int)value); } break;
         case strt_ind: if(width) { fprintf(f, "%*s", width, g_stratName[value]); } else { fprintf(f, "%s", g_stratName[value]); } break;
-        case wlog_ind: 
-        case clog_ind: 
-        case hlog_ind: 
-        case slog_ind: 
-        case slen_ind: 
+        case wlog_ind:
+        case clog_ind:
+        case hlog_ind:
+        case slog_ind:
+        case slen_ind:
         case tlen_ind: if(width) { fprintf(f, "%*u", width, value); } else { fprintf(f, "%u", value); } break;
         case NUM_PARAMS:
             DISPLAY("Error, not a valid param\n "); break;
     }
 }
 
+
 /*-************************************
 *  Benchmark Parameters/Global Variables
 **************************************/
@@ -241,7 +244,7 @@ static U32 g_noSeed = 0;
 static paramValues_t g_params; /* Initialized at the beginning of main w/ emptyParams() function */
 static double g_ratioMultiplier = 5.;
 static U32 g_strictness = PARAM_UNSET; /* range 1 - 100, measure of how strict  */
-static BMK_result_t g_lvltarget;
+static BMK_benchResult_t g_lvltarget;
 
 typedef enum {
     directMap,
@@ -258,14 +261,14 @@ typedef struct {
 } memoTable_t;
 
 typedef struct {
-    BMK_result_t result;
+    BMK_benchResult_t result;
     paramValues_t params;
 } winnerInfo_t;
 
 typedef struct {
     U32 cSpeed;  /* bytes / sec */
     U32 dSpeed;
-    U32 cMem;    /* bytes */    
+    U32 cMem;    /* bytes */
 } constraint_t;
 
 typedef struct winner_ll_node winner_ll_node;
@@ -284,8 +287,9 @@ static winner_ll_node* g_winners; /* linked list sorted ascending by cSize & cSp
  * g_clockGranularity
  */
 
+
 /*-*******************************************************
-*  General Util Functions 
+*  General Util Functions
 *********************************************************/
 
 /* nullified useless params, to ensure count stats */
@@ -464,7 +468,7 @@ static void paramVariation(paramValues_t* ptr, memoTable_t* mtAll, const U32 nbC
 static paramValues_t randomParams(void)
 {
     varInds_t v; paramValues_t p;
-    for(v = 0; v <= NUM_PARAMS; v++) {
+    for(v = 0; v < NUM_PARAMS; v++) {
         p.vals[v] = rangeMap(v, FUZ_rand(&g_rand) % rangetable[v]);
     }
     return p;
@@ -497,17 +501,20 @@ static void findClockGranularity(void) {
 **************************************/
 
 /* checks results are feasible */
-static int feasible(const BMK_result_t results, const constraint_t target) {
-    return (results.cSpeed >= target.cSpeed) && (results.dSpeed >= target.dSpeed) && (results.cMem <= target.cMem) && (!g_optmode || results.cSize <= g_lvltarget.cSize);
+static int feasible(const BMK_benchResult_t results, const constraint_t target) {
+    return (results.cSpeed >= target.cSpeed)
+        && (results.dSpeed >= target.dSpeed)
+        && (results.cMem <= target.cMem)
+        && (!g_optmode || results.cSize <= g_lvltarget.cSize);
 }
 
 /* hill climbing value for part 1 */
-/* Scoring here is a linear reward for all set constraints normalized between 0 to 1 
- * (with 0 at 0 and 1 being fully fulfilling the constraint), summed with a logarithmic 
+/* Scoring here is a linear reward for all set constraints normalized between 0 to 1
+ * (with 0 at 0 and 1 being fully fulfilling the constraint), summed with a logarithmic
  * bonus to exceeding the constraint value. We also give linear ratio for compression ratio.
- * The constant factors are experimental. 
+ * The constant factors are experimental.
  */
-static double resultScore(const BMK_result_t res, const size_t srcSize, const constraint_t target) {
+static double resultScore(const BMK_benchResult_t res, const size_t srcSize, const constraint_t target) {
     double cs = 0., ds = 0., rt, cm = 0.;
     const double r1 = 1, r2 = 0.1, rtr = 0.5;
     double ret;
@@ -516,14 +523,14 @@ static double resultScore(const BMK_result_t res, const size_t srcSize, const co
     if(target.cMem != (U32)-1) { cm = (double)target.cMem / res.cMem; }
     rt = ((double)srcSize / res.cSize);
 
-    ret = (MIN(1, cs) + MIN(1, ds)  + MIN(1, cm))*r1 + rt * rtr + 
+    ret = (MIN(1, cs) + MIN(1, ds)  + MIN(1, cm))*r1 + rt * rtr +
          (MAX(0, log(cs))+ MAX(0, log(ds))+ MAX(0, log(cm))) * r2;
 
     return ret;
 }
 
 /* calculates normalized squared euclidean distance of result1 if it is in the first quadrant relative to lvlRes */
-static double resultDistLvl(const BMK_result_t result1, const BMK_result_t lvlRes) {
+static double resultDistLvl(const BMK_benchResult_t result1, const BMK_benchResult_t lvlRes) {
     double normalizedCSpeedGain1 = (result1.cSpeed / lvlRes.cSpeed) - 1;
     double normalizedRatioGain1 = ((double)lvlRes.cSize / result1.cSize) - 1;
     if(normalizedRatioGain1 < 0 || normalizedCSpeedGain1 < 0) {
@@ -532,8 +539,8 @@ static double resultDistLvl(const BMK_result_t result1, const BMK_result_t lvlRe
     return normalizedRatioGain1 * g_ratioMultiplier + normalizedCSpeedGain1;
 }
 
-/* return true if r2 strictly better than r1 */ 
-static int compareResultLT(const BMK_result_t result1, const BMK_result_t result2, const constraint_t target, size_t srcSize) {
+/* return true if r2 strictly better than r1 */
+static int compareResultLT(const BMK_benchResult_t result1, const BMK_benchResult_t result2, const constraint_t target, size_t srcSize) {
     if(feasible(result1, target) && feasible(result2, target)) {
         if(g_optmode) {
             return resultDistLvl(result1, g_lvltarget) < resultDistLvl(result2, g_lvltarget);
@@ -547,7 +554,7 @@ static int compareResultLT(const BMK_result_t result1, const BMK_result_t result
 
 static constraint_t relaxTarget(constraint_t target) {
     target.cMem = (U32)-1;
-    target.cSpeed *= ((double)g_strictness) / 100; 
+    target.cSpeed *= ((double)g_strictness) / 100;
     target.dSpeed *= ((double)g_strictness) / 100;
     return target;
 }
@@ -598,7 +605,7 @@ static void optimizerAdjustInput(paramValues_t* pc, const size_t maxBlockSize) {
             DISPLAY("Warning: hashlog too much larger than windowLog size, adjusted to %u\n", pc->vals[hlog_ind]);
         }
     }
-    
+
     if(pc->vals[slog_ind] != PARAM_UNSET && pc->vals[clog_ind] != PARAM_UNSET) {
         if(pc->vals[slog_ind] > pc->vals[clog_ind]) {
             pc->vals[clog_ind] = pc->vals[slog_ind];
@@ -608,54 +615,61 @@ static void optimizerAdjustInput(paramValues_t* pc, const size_t maxBlockSize) {
 }
 
 static int redundantParams(const paramValues_t paramValues, const constraint_t target, const size_t maxBlockSize) {
-    return 
+    return
        (ZSTD_estimateCStreamSize_usingCParams(pvalsToCParams(paramValues)) > (size_t)target.cMem) /* Uses too much memory */
     || ((1ULL << (paramValues.vals[wlog_ind] - 1)) >= maxBlockSize && paramValues.vals[wlog_ind] != mintable[wlog_ind]) /* wlog too much bigger than src size */
     || (paramValues.vals[clog_ind] > (paramValues.vals[wlog_ind] + (paramValues.vals[strt_ind] > ZSTD_btlazy2))) /* chainLog larger than windowLog*/
     || (paramValues.vals[slog_ind] > paramValues.vals[clog_ind]) /* searchLog larger than chainLog */
     || (paramValues.vals[hlog_ind] > paramValues.vals[wlog_ind] + 1); /* hashLog larger than windowLog + 1 */
-    
+
 }
 
 /*-************************************
-*  Display Functions 
+*  Display Functions
 **************************************/
 
 static void BMK_translateAdvancedParams(FILE* f, const paramValues_t params) {
     varInds_t v;
     int first = 1;
     fprintf(f,"--zstd=");
-    for(v = 0; v < NUM_PARAMS; v++) {
-        if(g_silenceParams[v]) { continue; }
-        if(!first) { fprintf(f, ","); }
+    for (v = 0; v < NUM_PARAMS; v++) {
+        if (g_silenceParams[v]) { continue; }
+        if (!first) { fprintf(f, ","); }
         fprintf(f,"%s=", g_paramNames[v]);
-        
-        if(v == strt_ind) { fprintf(f,"%u", params.vals[v]); }
+
+        if (v == strt_ind) { fprintf(f,"%u", params.vals[v]); }
         else { displayParamVal(f, v, params.vals[v], 0); }
         first = 0;
     }
     fprintf(f, "\n");
 }
 
-static void BMK_displayOneResult(FILE* f, winnerInfo_t res, const size_t srcSize) {
-            varInds_t v;
-            int first = 1;
-            res.params = cParamUnsetMin(res.params);
-            fprintf(f,"    {");
-            for(v = 0; v < NUM_PARAMS; v++) {
-                if(g_silenceParams[v]) { continue; }
-                if(!first) { fprintf(f, ","); }
-                displayParamVal(f, v, res.params.vals[v], 3);
-                first = 0;
-            }
+static void BMK_displayOneResult(FILE* f, winnerInfo_t res, const size_t srcSize)
+{
+    varInds_t v;
+    int first = 1;
+    res.params = cParamUnsetMin(res.params);
+    fprintf(f, "    {");
+    for (v = 0; v < NUM_PARAMS; v++) {
+        if (g_silenceParams[v]) { continue; }
+        if (!first) { fprintf(f, ","); }
+        displayParamVal(f, v, res.params.vals[v], 3);
+        first = 0;
+    }
 
-            fprintf(f, " },     /* R:%5.3f at %5.1f MB/s - %5.1f MB/s */\n",
-            (double)srcSize / res.result.cSize, (double)res.result.cSpeed / (1 MB), (double)res.result.dSpeed / (1 MB));
+    {   double const ratio = res.result.cSize ?
+                            (double)srcSize / res.result.cSize : 0;
+        double const cSpeedMBps = (double)res.result.cSpeed / MB_UNIT;
+        double const dSpeedMBps = (double)res.result.dSpeed / MB_UNIT;
+
+        fprintf(f, " },     /* R:%5.3f at %5.1f MB/s - %5.1f MB/s */\n",
+                            ratio, cSpeedMBps, dSpeedMBps);
+    }
 }
 
 /* Writes to f the results of a parameter benchmark */
 /* when used with --optimize, will only print results better than previously discovered */
-static void BMK_printWinner(FILE* f, const int cLevel, const BMK_result_t result, const paramValues_t params, const size_t srcSize)
+static void BMK_printWinner(FILE* f, const int cLevel, const BMK_benchResult_t result, const paramValues_t params, const size_t srcSize)
 {
     char lvlstr[15] = "Custom Level";
     winnerInfo_t w;
@@ -668,10 +682,10 @@ static void BMK_printWinner(FILE* f, const int cLevel, const BMK_result_t result
         snprintf(lvlstr, 15, "  Level %2d  ", cLevel);
     }
 
-    if(TIMED) { 
+    if(TIMED) {
         const U64 time = UTIL_clockSpanNano(g_time);
         const U64 minutes = time / (60ULL * TIMELOOP_NANOSEC);
-        fprintf(f, "%1lu:%2lu:%05.2f - ", (unsigned long) minutes / 60,(unsigned long) minutes % 60, (double)(time - minutes * TIMELOOP_NANOSEC * 60ULL)/TIMELOOP_NANOSEC); 
+        fprintf(f, "%1lu:%2lu:%05.2f - ", (unsigned long) minutes / 60,(unsigned long) minutes % 60, (double)(time - minutes * TIMELOOP_NANOSEC * 60ULL)/TIMELOOP_NANOSEC);
     }
 
     fprintf(f, "/* %s */   ", lvlstr);
@@ -687,7 +701,7 @@ static void BMK_printWinner(FILE* f, const int cLevel, const BMK_result_t result
 #define SPEED_RESULT 4
 #define SIZE_RESULT 5
 /* maybe have epsilon-eq to limit table size? */
-static int speedSizeCompare(const BMK_result_t r1, const BMK_result_t r2) {
+static int speedSizeCompare(const BMK_benchResult_t r1, const BMK_benchResult_t r2) {
     if(r1.cSpeed < r2.cSpeed) {
         if(r1.cSize >= r2.cSize) {
             return BETTER_RESULT;
@@ -704,7 +718,7 @@ static int speedSizeCompare(const BMK_result_t r1, const BMK_result_t r2) {
 /* 0 for insertion, 1 for no insert */
 /* maintain invariant speedSizeCompare(n, n->next) = SPEED_RESULT */
 static int insertWinner(const winnerInfo_t w, const constraint_t targetConstraints) {
-    BMK_result_t r = w.result;
+    BMK_benchResult_t r = w.result;
     winner_ll_node* cur_node = g_winners;
     /* first node to insert */
     if(!feasible(r, targetConstraints)) {
@@ -735,7 +749,7 @@ static int insertWinner(const winnerInfo_t w, const constraint_t targetConstrain
                 tmp = cur_node->next;
                 cur_node->next = cur_node->next->next;
                 free(tmp);
-                break; 
+                break;
             }
             case SIZE_RESULT:
             {
@@ -754,7 +768,7 @@ static int insertWinner(const winnerInfo_t w, const constraint_t targetConstrain
                 cur_node->next = newnode;
                 return 0;
             }
-        } 
+        }
 
     }
 
@@ -792,12 +806,12 @@ static int insertWinner(const winnerInfo_t w, const constraint_t targetConstrain
             cur_node->next = newnode;
             return 0;
         }
-        default: 
+        default:
             return 1;
-    } 
+    }
 }
 
-static void BMK_printWinnerOpt(FILE* f, const U32 cLevel, const BMK_result_t result, const paramValues_t params, const constraint_t targetConstraints, const size_t srcSize)
+static void BMK_printWinnerOpt(FILE* f, const U32 cLevel, const BMK_benchResult_t result, const paramValues_t params, const constraint_t targetConstraints, const size_t srcSize)
 {
     /* global winner used for constraints */
                                     /* cSize, cSpeed, dSpeed, cMem */
@@ -814,7 +828,7 @@ static void BMK_printWinnerOpt(FILE* f, const U32 cLevel, const BMK_result_t res
             g_winner.result = result;
             g_winner.params = params;
         }
-    }  
+    }
 
     if(g_optmode && g_optimizer && (DEBUG || g_displayLevel == 3)) {
         winnerInfo_t w;
@@ -824,8 +838,8 @@ static void BMK_printWinnerOpt(FILE* f, const U32 cLevel, const BMK_result_t res
         insertWinner(w, targetConstraints);
 
         if(!DEBUG) { fprintf(f, "\033c"); }
-        fprintf(f, "\n"); 
-        
+        fprintf(f, "\n");
+
         /* the table */
         fprintf(f, "================================\n");
         for(n = g_winners; n != NULL; n = n->next) {
@@ -833,7 +847,7 @@ static void BMK_printWinnerOpt(FILE* f, const U32 cLevel, const BMK_result_t res
         }
         fprintf(f, "================================\n");
         fprintf(f, "Level Bounds: R: > %.3f AND C: < %.1f MB/s \n\n",
-            (double)srcSize / g_lvltarget.cSize, (double)g_lvltarget.cSpeed / (1 MB));
+            (double)srcSize / g_lvltarget.cSize, (double)g_lvltarget.cSpeed / MB_UNIT);
 
 
         fprintf(f, "Overall Winner: \n");
@@ -871,7 +885,7 @@ static void BMK_printWinners(FILE* f, const winnerInfo_t* winners, const size_t
 *********************************************************/
 
 typedef struct {
-    ZSTD_CCtx* ctx;
+    ZSTD_CCtx* cctx;
     const void* dictBuffer;
     size_t dictBufferSize;
     int cLevel;
@@ -881,15 +895,15 @@ typedef struct {
 static size_t local_initCCtx(void* payload) {
     const BMK_initCCtxArgs* ag = (const BMK_initCCtxArgs*)payload;
     varInds_t i;
-    ZSTD_CCtx_reset(ag->ctx);
-    ZSTD_CCtx_resetParameters(ag->ctx);
-    ZSTD_CCtx_setParameter(ag->ctx, ZSTD_p_compressionLevel, ag->cLevel);
+    ZSTD_CCtx_reset(ag->cctx);
+    ZSTD_CCtx_resetParameters(ag->cctx);
+    ZSTD_CCtx_setParameter(ag->cctx, ZSTD_p_compressionLevel, ag->cLevel);
 
     for(i = 0; i < NUM_PARAMS; i++) {
         if(ag->comprParams->vals[i] != PARAM_UNSET)
-        ZSTD_CCtx_setParameter(ag->ctx, cctxSetParamTable[i], ag->comprParams->vals[i]);
+        ZSTD_CCtx_setParameter(ag->cctx, cctxSetParamTable[i], ag->comprParams->vals[i]);
     }
-    ZSTD_CCtx_loadDictionary(ag->ctx, ag->dictBuffer, ag->dictBufferSize);
+    ZSTD_CCtx_loadDictionary(ag->cctx, ag->dictBuffer, ag->dictBufferSize);
 
     return 0;
 }
@@ -909,8 +923,8 @@ static size_t local_initDCtx(void* payload) {
 
 /* additional argument is just the context */
 static size_t local_defaultCompress(
-    const void* srcBuffer, size_t srcSize, 
-    void* dstBuffer, size_t dstSize, 
+    const void* srcBuffer, size_t srcSize,
+    void* dstBuffer, size_t dstSize,
     void* addArgs) {
     size_t moreToFlush = 1;
     ZSTD_CCtx* ctx = (ZSTD_CCtx*)addArgs;
@@ -937,8 +951,8 @@ static size_t local_defaultCompress(
 
 /* additional argument is just the context */
 static size_t local_defaultDecompress(
-    const void* srcBuffer, size_t srcSize, 
-    void* dstBuffer, size_t dstSize, 
+    const void* srcBuffer, size_t srcSize,
+    void* dstBuffer, size_t dstSize,
     void* addArgs) {
     size_t moreToFlush = 1;
     ZSTD_DCtx* dctx = (ZSTD_DCtx*)addArgs;
@@ -965,7 +979,7 @@ static size_t local_defaultDecompress(
 }
 
 /*-************************************
-*  Data Initialization Functions 
+*  Data Initialization Functions
 **************************************/
 
 typedef struct {
@@ -1041,7 +1055,7 @@ static int createBuffersFromMemory(buffers_t* buff, void * srcBuffer, const size
     buff->dstSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
 
     buff->resPtrs = (void**)calloc(maxNbBlocks, sizeof(void*));
-    buff->resSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t)); 
+    buff->resSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));
 
     if(!buff->srcPtrs || !buff->srcSizes || !buff->dstPtrs || !buff->dstCapacities || !buff->dstSizes || !buff->resPtrs || !buff->resSizes) {
         DISPLAY("alloc error\n");
@@ -1090,18 +1104,18 @@ static int createBuffersFromMemory(buffers_t* buff, void * srcBuffer, const size
     buff->nbBlocks = blockNb;
 
     return 0;
-} 
+}
 
 /* allocates buffer's arguments. returns success / failuere */
-static int createBuffers(buffers_t* buff, const char* const * const fileNamesTable, 
+static int createBuffers(buffers_t* buff, const char* const * const fileNamesTable,
                           size_t nbFiles) {
     size_t pos = 0;
     size_t n;
     size_t totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, (U32)nbFiles);
     size_t benchedSize = MIN(BMK_findMaxMem(totalSizeToLoad * 3) / 3, totalSizeToLoad);
     size_t* fileSizes = calloc(sizeof(size_t), nbFiles);
-    void* srcBuffer = NULL; 
-    int ret = 0;  
+    void* srcBuffer = NULL;
+    int ret = 0;
 
     if(!totalSizeToLoad || !benchedSize) {
         ret = 1;
@@ -1139,7 +1153,7 @@ static int createBuffers(buffers_t* buff, const char* const * const fileNamesTab
 
         if (fileSize + pos > benchedSize) fileSize = benchedSize - pos, nbFiles=n;   /* buffer too small - stop after this file */
         {
-            char* buffer = (char*)(srcBuffer); 
+            char* buffer = (char*)(srcBuffer);
             size_t const readSize = fread((buffer)+pos, 1, (size_t)fileSize, f);
             fclose(f);
             if (readSize != (size_t)fileSize) {
@@ -1181,14 +1195,14 @@ static int createContexts(contexts_t* ctx, const char* dictFileName) {
     ctx->dictBuffer = malloc(ctx->dictSize);
 
     f = fopen(dictFileName, "rb");
-    
+
     if(!f) {
         DISPLAY("unable to open file\n");
         fclose(f);
         freeContexts(*ctx);
         return 1;
     }
-    
+
     if(ctx->dictSize > 64 MB || !(ctx->dictBuffer)) {
         DISPLAY("dictionary too large\n");
         fclose(f);
@@ -1207,7 +1221,7 @@ static int createContexts(contexts_t* ctx, const char* dictFileName) {
 }
 
 /*-************************************
-*  Optimizer Memoization Functions 
+*  Optimizer Memoization Functions
 **************************************/
 
 /* return: new length */
@@ -1218,7 +1232,7 @@ static size_t sanitizeVarArray(varInds_t* varNew, const size_t varLength, const
     for(i = 0; i < varLength; i++) {
         if( !((varArray[i] == clog_ind && strat == ZSTD_fast)
             || (varArray[i] == slog_ind && strat == ZSTD_fast)
-            || (varArray[i] == slog_ind && strat == ZSTD_dfast) 
+            || (varArray[i] == slog_ind && strat == ZSTD_dfast)
             || (varArray[i] == tlen_ind && strat != ZSTD_btopt && strat != ZSTD_btultra && strat != ZSTD_fast))) {
             varNew[j] = varArray[i];
             j++;
@@ -1305,7 +1319,7 @@ static void freeMemoTableArray(memoTable_t* const mtAll) {
 static memoTable_t* createMemoTableArray(const paramValues_t p, const varInds_t* const varyParams, const size_t varyLen, const U32 memoTableLog) {
     memoTable_t* mtAll = (memoTable_t*)calloc(sizeof(memoTable_t),(ZSTD_btultra + 1));
     ZSTD_strategy i, stratMin = ZSTD_fast, stratMax = ZSTD_btultra;
-    
+
     if(mtAll == NULL) {
         return NULL;
     }
@@ -1324,7 +1338,7 @@ static memoTable_t* createMemoTableArray(const paramValues_t p, const varInds_t*
         return mtAll;
     }
 
-    
+
     if(p.vals[strt_ind] != PARAM_UNSET) {
         stratMin = p.vals[strt_ind];
         stratMax = p.vals[strt_ind];
@@ -1348,7 +1362,7 @@ static memoTable_t* createMemoTableArray(const paramValues_t p, const varInds_t*
             return NULL;
         }
     }
-    
+
     return mtAll;
 }
 
@@ -1363,7 +1377,7 @@ static void randomConstrainedParams(paramValues_t* pc, const memoTable_t* memoTa
         int i;
         for(i = 0; i < NUM_PARAMS; i++) {
             varInds_t v = mt.varArray[i];
-            if(v == strt_ind) continue; 
+            if(v == strt_ind) continue;
             pc->vals[v] = rangeMap(v, FUZ_rand(&g_rand) % rangetable[v]);
         }
 
@@ -1378,16 +1392,17 @@ static void randomConstrainedParams(paramValues_t* pc, const memoTable_t* memoTa
 /* Replicate functionality of benchMemAdvanced, but with pre-split src / dst buffers */
 /* The purpose is so that sufficient information is returned so that a decompression call to benchMemInvertible is possible */
 /* BMK_benchMemAdvanced(srcBuffer,srcSize, dstBuffer, dstSize, fileSizes, nbFiles, 0, &cParams, dictBuffer, dictSize, ctx, dctx, 0, "File", &adv); */
-/* nbSeconds used in same way as in BMK_advancedParams_t, as nbIters when in iterMode */
-
+/* nbSeconds used in same way as in BMK_advancedParams_t */
 /* if in decodeOnly, then srcPtr's will be compressed blocks, and uncompressedBlocks will be written to dstPtrs */
 /* dictionary nullable, nothing else though. */
-static BMK_return_t BMK_benchMemInvertible(const buffers_t buf, const contexts_t ctx, 
-                        const int cLevel, const paramValues_t* comprParams,
-                        const BMK_mode_t mode, const BMK_loopMode_t loopMode, const unsigned nbSeconds) {
-
+/* note : it would be better if this function was in bench.c, sharing code with benchMemAdvanced(), since it's technically a part of it */
+static BMK_benchOutcome_t
+BMK_benchMemInvertible( buffers_t buf, contexts_t ctx,
+                        int cLevel, const paramValues_t* comprParams,
+                        BMK_mode_t mode, unsigned nbSeconds)
+{
     U32 i;
-    BMK_return_t results = { { 0, 0., 0., 0 }, 0 } ;
+    BMK_benchResult_t bResult;
     const void *const *const srcPtrs = (const void *const *const)buf.srcPtrs;
     size_t const *const srcSizes = buf.srcSizes;
     void** const dstPtrs = buf.dstPtrs;
@@ -1402,9 +1417,12 @@ static BMK_return_t BMK_benchMemInvertible(const buffers_t buf, const contexts_t
     ZSTD_CCtx* cctx = ctx.cctx;
     ZSTD_DCtx* dctx = ctx.dctx;
 
+    /* init */
+    memset(&bResult, 0, sizeof(bResult));
+
     /* warmimg up memory */
-    for(i = 0; i < buf.nbBlocks; i++) {
-        if(mode != BMK_decodeOnly) {
+    for (i = 0; i < buf.nbBlocks; i++) {
+        if (mode != BMK_decodeOnly) {
             RDG_genBuffer(dstPtrs[i], dstCapacities[i], 0.10, 0.50, 1);
         } else {
             RDG_genBuffer(resPtrs[i], resSizes[i], 0.10, 0.50, 1);
@@ -1414,9 +1432,13 @@ static BMK_return_t BMK_benchMemInvertible(const buffers_t buf, const contexts_t
     /* Bench */
     {
         /* init args */
+        int compressionCompleted = (mode == BMK_decodeOnly);
+        int decompressionCompleted = (mode == BMK_compressOnly);
+        BMK_timedFnState_t* timeStateCompress = BMK_createTimedFnState(nbSeconds * 1000, 1000);
+        BMK_timedFnState_t* timeStateDecompress = BMK_createTimedFnState(nbSeconds * 1000, 1000);
         BMK_initCCtxArgs cctxprep;
         BMK_initDCtxArgs dctxprep;
-        cctxprep.ctx = cctx;
+        cctxprep.cctx = cctx;
         cctxprep.dictBuffer = dictBuffer;
         cctxprep.dictBufferSize = dictBufferSize;
         cctxprep.cLevel = cLevel;
@@ -1425,130 +1447,115 @@ static BMK_return_t BMK_benchMemInvertible(const buffers_t buf, const contexts_t
         dctxprep.dictBuffer = dictBuffer;
         dctxprep.dictBufferSize = dictBufferSize;
 
-        if(loopMode == BMK_timeMode) {
-            BMK_customTimedReturn_t intermediateResultCompress;
-            BMK_customTimedReturn_t intermediateResultDecompress;  
-            BMK_timedFnState_t* timeStateCompress = BMK_createTimeState(nbSeconds);
-            BMK_timedFnState_t* timeStateDecompress = BMK_createTimeState(nbSeconds);
-            if(mode == BMK_compressOnly) {
-                intermediateResultCompress.completed = 0;
-                intermediateResultDecompress.completed = 1;
-            } else if (mode == BMK_decodeOnly) {
-                intermediateResultCompress.completed = 1;
-                intermediateResultDecompress.completed = 0;
-            } else { /* both */
-                intermediateResultCompress.completed = 0;
-                intermediateResultDecompress.completed = 0;
+        assert(timeStateCompress != NULL);
+        assert(timeStateDecompress != NULL);
+        while(!compressionCompleted) {
+            BMK_runOutcome_t const cOutcome = BMK_benchTimedFn(timeStateCompress,
+                                            &local_defaultCompress, cctx,
+                                            &local_initCCtx, &cctxprep,
+                                            nbBlocks,
+                                            srcPtrs, srcSizes,
+                                            dstPtrs, dstCapacities,
+                                            dstSizes);
+
+            if (!BMK_isSuccessful_runOutcome(cOutcome)) {
+                BMK_benchOutcome_t bOut;
+                memset(&bOut, 0, sizeof(bOut));
+                bOut.tag = 1;   /* should rather be a function or a constant */
+                BMK_freeTimedFnState(timeStateCompress);
+                BMK_freeTimedFnState(timeStateDecompress);
+                return bOut;
             }
-
-            while(!intermediateResultCompress.completed) {
-                intermediateResultCompress = BMK_benchFunctionTimed(timeStateCompress, &local_defaultCompress, (void*)cctx, &local_initCCtx, (void*)&cctxprep,
-                    nbBlocks, srcPtrs, srcSizes, dstPtrs, dstCapacities, dstSizes);
-
-                if(intermediateResultCompress.result.error) {
-                    results.error = intermediateResultCompress.result.error;
-                    BMK_freeTimeState(timeStateCompress);
-                    BMK_freeTimeState(timeStateDecompress);
-                    return results;
-                }
-                results.result.cSpeed = (srcSize * TIMELOOP_NANOSEC) / intermediateResultCompress.result.result.nanoSecPerRun;
-                results.result.cSize = intermediateResultCompress.result.result.sumOfReturn;
-            }
-
-            while(!intermediateResultDecompress.completed) {
-                intermediateResultDecompress = BMK_benchFunctionTimed(timeStateDecompress, &local_defaultDecompress, (void*)(dctx), &local_initDCtx, (void*)&dctxprep,
-                    nbBlocks, (const void* const*)dstPtrs, dstSizes, resPtrs, resSizes, NULL);
-
-                if(intermediateResultDecompress.result.error) {
-                    results.error = intermediateResultDecompress.result.error;
-                    BMK_freeTimeState(timeStateCompress);
-                    BMK_freeTimeState(timeStateDecompress);
-                    return results;
-                }
-                results.result.dSpeed = (srcSize * TIMELOOP_NANOSEC) / intermediateResultDecompress.result.result.nanoSecPerRun;
-            }
-
-            BMK_freeTimeState(timeStateCompress);
-            BMK_freeTimeState(timeStateDecompress);
-
-        } else { /* iterMode; */
-            if(mode != BMK_decodeOnly) {
-
-                BMK_customReturn_t compressionResults = BMK_benchFunction(&local_defaultCompress, (void*)cctx, &local_initCCtx, (void*)&cctxprep,
-                    nbBlocks, srcPtrs, srcSizes, dstPtrs, dstCapacities, dstSizes, nbSeconds); 
-                if(compressionResults.error) {
-                    results.error = compressionResults.error;
-                    return results;
-                }
-                if(compressionResults.result.nanoSecPerRun == 0) {
-                    results.result.cSpeed = 0;
-                } else {
-                    results.result.cSpeed = srcSize * TIMELOOP_NANOSEC / compressionResults.result.nanoSecPerRun;
-                }
-                results.result.cSize = compressionResults.result.sumOfReturn;
-            }
-
-            if(mode != BMK_compressOnly) {
-                BMK_customReturn_t decompressionResults;
-                decompressionResults = BMK_benchFunction(
-                    &local_defaultDecompress, (void*)(dctx),
-                    &local_initDCtx, (void*)&dctxprep, nbBlocks,
-                    (const void* const*)dstPtrs, dstSizes, resPtrs, resSizes, NULL,
-                    nbSeconds);
-
-                if(decompressionResults.error) {
-                    results.error = decompressionResults.error;
-                    return results;
-                }
-
-                if(decompressionResults.result.nanoSecPerRun == 0) {
-                    results.result.dSpeed = 0;
-                } else {
-                    results.result.dSpeed = srcSize * TIMELOOP_NANOSEC / decompressionResults.result.nanoSecPerRun;
-                }
+            {   BMK_runTime_t const rResult = BMK_extract_runTime(cOutcome);
+                bResult.cSpeed = (srcSize * TIMELOOP_NANOSEC) / rResult.nanoSecPerRun;
+                bResult.cSize = rResult.sumOfReturn;
             }
+            compressionCompleted = BMK_isCompleted_TimedFn(timeStateCompress);
         }
+
+        while (!decompressionCompleted) {
+            BMK_runOutcome_t const dOutcome = BMK_benchTimedFn(timeStateDecompress,
+                                        &local_defaultDecompress, dctx,
+                                        &local_initDCtx, &dctxprep,
+                                        nbBlocks,
+                                        (const void* const*)dstPtrs, dstSizes,
+                                        resPtrs, resSizes,
+                                        NULL);
+
+            if (!BMK_isSuccessful_runOutcome(dOutcome)) {
+                BMK_benchOutcome_t bOut;
+                memset(&bOut, 0, sizeof(bOut));
+                bOut.tag = 1;   /* should rather be a function or a constant */
+                BMK_freeTimedFnState(timeStateCompress);
+                BMK_freeTimedFnState(timeStateDecompress);
+                return bOut;
+            }
+            {   BMK_runTime_t const rResult = BMK_extract_runTime(dOutcome);
+                bResult.dSpeed = (srcSize * TIMELOOP_NANOSEC) / rResult.nanoSecPerRun;
+            }
+            decompressionCompleted = BMK_isCompleted_TimedFn(timeStateDecompress);
+        }
+
+        BMK_freeTimedFnState(timeStateCompress);
+        BMK_freeTimedFnState(timeStateDecompress);
     }
+
    /* Bench */
-    results.result.cMem = (1 << (comprParams->vals[wlog_ind])) + ZSTD_sizeof_CCtx(cctx);
-    return results;
+    bResult.cMem = (1 << (comprParams->vals[wlog_ind])) + ZSTD_sizeof_CCtx(cctx);
+
+    {   BMK_benchOutcome_t bOut;
+        bOut.tag = 0;
+        bOut.internal_never_use_directly = bResult;  /* should be a function */
+        return bOut;
+    }
 }
 
-static int BMK_benchParam(BMK_result_t* resultPtr,
-                const buffers_t buf, const contexts_t ctx,
-                const paramValues_t cParams) {
-    BMK_return_t res = BMK_benchMemInvertible(buf, ctx, BASE_CLEVEL, &cParams, BMK_both, BMK_timeMode, 3);
-    *resultPtr = res.result;
-    return res.error;
+static int BMK_benchParam ( BMK_benchResult_t* resultPtr,
+                            buffers_t buf, contexts_t ctx,
+                            paramValues_t cParams)
+{
+    BMK_benchOutcome_t const outcome = BMK_benchMemInvertible(buf, ctx,
+                                                        BASE_CLEVEL, &cParams,
+                                                        BMK_both, 3);
+    int const success = BMK_isSuccessful_benchOutcome(outcome);
+    if (!success) return 1;
+    *resultPtr = BMK_extract_benchResult(outcome);
+    return 0;
 }
 
 
-#define CBENCHMARK(conditional, resultvar, tmpret, mode, loopmode, sec) {                                       \
+#define CBENCHMARK(conditional, resultvar, tmpret, mode, sec) {                                                 \
     if(conditional) {                                                                                           \
-        BMK_return_t tmpret = BMK_benchMemInvertible(buf, ctx, BASE_CLEVEL, &cParams, mode, loopmode, sec);     \
-        if(tmpret.error) { DEBUGOUTPUT("Benchmarking failed\n"); return ERROR_RESULT; }                         \
-        if(mode != BMK_decodeOnly)  {                                                                           \
-            resultvar.cSpeed = tmpret.result.cSpeed;                                                            \
-            resultvar.cSize = tmpret.result.cSize;                                                              \
-            resultvar.cMem = tmpret.result.cMem;                                                                \
+        BMK_benchOutcome_t const outcome = BMK_benchMemInvertible(buf, ctx, BASE_CLEVEL, &cParams, mode, sec);  \
+        if (!BMK_isSuccessful_benchOutcome(outcome)) {                                                          \
+            DEBUGOUTPUT("Benchmarking failed\n");                                                               \
+            return ERROR_RESULT;                                                                                \
         }                                                                                                       \
-        if(mode != BMK_compressOnly) { resultvar.dSpeed = tmpret.result.dSpeed; }                               \
-    }                                                                                                           \
+        {   BMK_benchResult_t const tmpResult = BMK_extract_benchResult(outcome);                               \
+            if (mode != BMK_decodeOnly)  {                                                                      \
+                resultvar.cSpeed = tmpResult.cSpeed;                                                            \
+                resultvar.cSize = tmpResult.cSize;                                                              \
+                resultvar.cMem = tmpResult.cMem;                                                                \
+            }                                                                                                   \
+            if (mode != BMK_compressOnly) { resultvar.dSpeed = tmpResult.dSpeed; }                              \
+    }   }                                                                                                       \
 }
 
 /* Benchmarking which stops when we are sufficiently sure the solution is infeasible / worse than the winner */
-#define VARIANCE 1.2 
-static int allBench(BMK_result_t* resultPtr,
+#define VARIANCE 1.2
+static int allBench(BMK_benchResult_t* resultPtr,
                 const buffers_t buf, const contexts_t ctx,
                 const paramValues_t cParams,
                 const constraint_t target,
-                BMK_result_t* winnerResult, int feas) {
-    BMK_result_t resultMax, benchres;
+                BMK_benchResult_t* winnerResult, int feas)
+{
+    BMK_benchResult_t benchres;
     U64 loopDurationC = 0, loopDurationD = 0;
     double uncertaintyConstantC = 3., uncertaintyConstantD = 3.;
     double winnerRS;
+
     /* initial benchmarking, gives exact ratio and memory, warms up future runs */
-    CBENCHMARK(1, benchres, tmp, BMK_both, BMK_iterMode, 1);
+    CBENCHMARK(1, benchres, tmp, BMK_both, 2);
 
     winnerRS = resultScore(*winnerResult, buf.srcSize, target);
     DEBUGOUTPUT("WinnerScore: %f\n ", winnerRS);
@@ -1557,13 +1564,13 @@ static int allBench(BMK_result_t* resultPtr,
 
     /* calculate uncertainty in compression / decompression runs */
     if(benchres.cSpeed) {
-        loopDurationC = ((buf.srcSize * TIMELOOP_NANOSEC) / benchres.cSpeed); 
-        uncertaintyConstantC = ((loopDurationC + (double)(2 * g_clockGranularity))/loopDurationC); 
+        loopDurationC = (((U64)buf.srcSize * TIMELOOP_NANOSEC) / benchres.cSpeed);
+        uncertaintyConstantC = ((loopDurationC + (double)(2 * g_clockGranularity))/loopDurationC);
     }
 
     if(benchres.dSpeed) {
-        loopDurationD = ((buf.srcSize * TIMELOOP_NANOSEC) / benchres.dSpeed); 
-        uncertaintyConstantD = ((loopDurationD + (double)(2 * g_clockGranularity))/loopDurationD);  
+        loopDurationD = (((U64)buf.srcSize * TIMELOOP_NANOSEC) / benchres.dSpeed);
+        uncertaintyConstantD = ((loopDurationD + (double)(2 * g_clockGranularity))/loopDurationD);
     }
 
     /* anything with worse ratio in feas is definitely worse, discard */
@@ -1571,51 +1578,50 @@ static int allBench(BMK_result_t* resultPtr,
         return WORSE_RESULT;
     }
 
-    /* second run, if first run is too short, gives approximate cSpeed + dSpeed */
-    CBENCHMARK(loopDurationC < TIMELOOP_NANOSEC / 10, benchres, tmp, BMK_compressOnly, BMK_iterMode, 1);
-    CBENCHMARK(loopDurationD < TIMELOOP_NANOSEC / 10, benchres, tmp, BMK_decodeOnly,   BMK_iterMode, 1);
+    /* ensure all measurements last a minimum time, to reduce measurement errors */
+    assert(loopDurationC >= TIMELOOP_NANOSEC / 10);
+    assert(loopDurationD >= TIMELOOP_NANOSEC / 10);
 
     *resultPtr = benchres;
 
     /* optimistic assumption of benchres */
-    resultMax = benchres;
-    resultMax.cSpeed *= uncertaintyConstantC * VARIANCE;
-    resultMax.dSpeed *= uncertaintyConstantD * VARIANCE;
+    {   BMK_benchResult_t resultMax = benchres;
+        resultMax.cSpeed *= uncertaintyConstantC * VARIANCE;
+        resultMax.dSpeed *= uncertaintyConstantD * VARIANCE;
 
-    /* disregard infeasible results in feas mode */
-    /* disregard if resultMax < winner in infeas mode */
-    if((feas && !feasible(resultMax, target)) ||
-      (!feas && (winnerRS > resultScore(resultMax, buf.srcSize, target)))) {
-        return WORSE_RESULT;
+        /* disregard infeasible results in feas mode */
+        /* disregard if resultMax < winner in infeas mode */
+        if((feas && !feasible(resultMax, target)) ||
+          (!feas && (winnerRS > resultScore(resultMax, buf.srcSize, target)))) {
+            return WORSE_RESULT;
+        }
     }
 
-    CBENCHMARK(loopDurationC < TIMELOOP_NANOSEC, benchres, tmp, BMK_compressOnly, BMK_timeMode, 1);
-    CBENCHMARK(loopDurationD < TIMELOOP_NANOSEC, benchres, tmp, BMK_decodeOnly,   BMK_timeMode, 1);
-
     *resultPtr = benchres;
 
     /* compare by resultScore when in infeas */
     /* compare by compareResultLT when in feas */
-    if((!feas && (resultScore(benchres, buf.srcSize, target) > resultScore(*winnerResult, buf.srcSize, target))) || 
-       (feas && (compareResultLT(*winnerResult, benchres, target, buf.srcSize))) )  { 
-        return BETTER_RESULT; 
-    } else { 
-        return WORSE_RESULT; 
+    if((!feas && (resultScore(benchres, buf.srcSize, target) > resultScore(*winnerResult, buf.srcSize, target))) ||
+       (feas && (compareResultLT(*winnerResult, benchres, target, buf.srcSize))) )  {
+        return BETTER_RESULT;
+    } else {
+        return WORSE_RESULT;
     }
 }
 
+
 #define INFEASIBLE_THRESHOLD 200
 /* Memoized benchmarking, won't benchmark anything which has already been benchmarked before. */
-static int benchMemo(BMK_result_t* resultPtr,
-                const buffers_t buf, const contexts_t ctx, 
+static int benchMemo(BMK_benchResult_t* resultPtr,
+                const buffers_t buf, const contexts_t ctx,
                 const paramValues_t cParams,
                 const constraint_t target,
-                BMK_result_t* winnerResult, memoTable_t* const memoTableArray,
+                BMK_benchResult_t* winnerResult, memoTable_t* const memoTableArray,
                 const int feas) {
     static int bmcount = 0;
     int res;
 
-    if(memoTableGet(memoTableArray, cParams) >= INFEASIBLE_THRESHOLD || redundantParams(cParams, target, buf.maxBlockSize)) { return WORSE_RESULT; } 
+    if(memoTableGet(memoTableArray, cParams) >= INFEASIBLE_THRESHOLD || redundantParams(cParams, target, buf.maxBlockSize)) { return WORSE_RESULT; }
 
     res = allBench(resultPtr, buf, ctx, cParams, target, winnerResult, feas);
 
@@ -1631,6 +1637,7 @@ static int benchMemo(BMK_result_t* resultPtr,
     return res;
 }
 
+
 typedef struct {
     U64 cSpeed_min;
     U64 dSpeed_min;
@@ -1659,10 +1666,10 @@ static void BMK_init_level_constraints(int bytePerSec_level1)
     }   }
 }
 
-static int BMK_seed(winnerInfo_t* winners, const paramValues_t params, 
+static int BMK_seed(winnerInfo_t* winners, const paramValues_t params,
                     const buffers_t buf, const contexts_t ctx)
 {
-    BMK_result_t testResult;
+    BMK_benchResult_t testResult;
     int better = 0;
     int cLevel;
 
@@ -1729,16 +1736,16 @@ static int BMK_seed(winnerInfo_t* winners, const paramValues_t params,
                 /* too large compression speed difference for the compression benefit */
                 if (W_ratio > O_ratio)
                 DISPLAY ("Compression Speed : %5.3f @ %4.1f MB/s  vs  %5.3f @ %4.1f MB/s   : not enough for level %i\n",
-                         W_ratio, (double)testResult.cSpeed / (1 MB),
-                         O_ratio, (double)winners[cLevel].result.cSpeed / (1 MB),   cLevel);
+                         W_ratio, (double)testResult.cSpeed / MB_UNIT,
+                         O_ratio, (double)winners[cLevel].result.cSpeed / MB_UNIT,   cLevel);
                 continue;
             }
             if (W_DSpeed_note   < O_DSpeed_note  ) {
                 /* too large decompression speed difference for the compression benefit */
                 if (W_ratio > O_ratio)
                 DISPLAY ("Decompression Speed : %5.3f @ %4.1f MB/s  vs  %5.3f @ %4.1f MB/s   : not enough for level %i\n",
-                         W_ratio, (double)testResult.dSpeed / (1 MB),
-                         O_ratio, (double)winners[cLevel].result.dSpeed / (1 MB),   cLevel);
+                         W_ratio, (double)testResult.dSpeed / MB_UNIT,
+                         O_ratio, (double)winners[cLevel].result.dSpeed / MB_UNIT,   cLevel);
                 continue;
             }
 
@@ -1781,7 +1788,7 @@ static void playAround(FILE* f, winnerInfo_t* winners,
 
         if (nbVariations++ > g_maxNbVariations) break;
 
-        do { for(i = 0; i < 4; i++) { paramVaryOnce(FUZ_rand(&g_rand) % (strt_ind + 1), ((FUZ_rand(&g_rand) & 1) << 1) - 1, &p); } } 
+        do { for(i = 0; i < 4; i++) { paramVaryOnce(FUZ_rand(&g_rand) % (strt_ind + 1), ((FUZ_rand(&g_rand) & 1) << 1) - 1, &p); } }
         while(!paramValid(p));
 
         /* exclude faster if already played params */
@@ -1815,7 +1822,7 @@ static void BMK_selectRandomStart(
     }
 }
 
-static void BMK_benchFullTable(const buffers_t buf, const contexts_t ctx) 
+static void BMK_benchFullTable(const buffers_t buf, const contexts_t ctx)
 {
     paramValues_t params;
     winnerInfo_t winners[NB_LEVELS_TRACKED+1];
@@ -1828,11 +1835,11 @@ static void BMK_benchFullTable(const buffers_t buf, const contexts_t ctx)
     if (f==NULL) { DISPLAY("error opening %s \n", rfName); exit(1); }
 
     if (g_target) {
-        BMK_init_level_constraints(g_target * (1 MB));
+        BMK_init_level_constraints(g_target * MB_UNIT);
     } else {
         /* baseline config for level 1 */
         paramValues_t const l1params = cParamsToPVals(ZSTD_getCParams(1, buf.maxBlockSize, ctx.dictSize));
-        BMK_result_t testResult;
+        BMK_benchResult_t testResult;
         BMK_benchParam(&testResult, buf, ctx, l1params);
         BMK_init_level_constraints((int)((testResult.cSpeed * 31) / 32));
     }
@@ -1861,15 +1868,16 @@ static void BMK_benchFullTable(const buffers_t buf, const contexts_t ctx)
     fclose(f);
 }
 
+
 /*-************************************
 *  Single Benchmark Functions
 **************************************/
 
 static int benchOnce(const buffers_t buf, const contexts_t ctx, const int cLevel) {
-    BMK_result_t testResult;
+    BMK_benchResult_t testResult;
     g_params = adjustParams(overwriteParams(cParamsToPVals(ZSTD_getCParams(cLevel, buf.maxBlockSize, ctx.dictSize)), g_params), buf.maxBlockSize, ctx.dictSize);
 
-    if(BMK_benchParam(&testResult, buf, ctx, g_params)) {
+    if (BMK_benchParam(&testResult, buf, ctx, g_params)) {
         DISPLAY("Error during benchmarking\n");
         return 1;
     }
@@ -1883,12 +1891,12 @@ static int benchSample(double compressibility, int cLevel)
 {
     const char* const name = "Sample 10MB";
     size_t const benchedSize = 10 MB;
-    void* srcBuffer = malloc(benchedSize);
+    void* const srcBuffer = malloc(benchedSize);
     int ret = 0;
 
     buffers_t buf;
     contexts_t ctx;
-    
+
     if(srcBuffer == NULL) {
         DISPLAY("Out of Memory\n");
         return 2;
@@ -1927,31 +1935,32 @@ static int benchSample(double compressibility, int cLevel)
 /* benchFiles() :
  * note: while this function takes a table of filenames,
  * in practice, only the first filename will be used */
-int benchFiles(const char** fileNamesTable, int nbFiles, const char* dictFileName, const int cLevel)
+int benchFiles(const char** fileNamesTable, int nbFiles,
+               const char* dictFileName, int cLevel)
 {
     buffers_t buf;
     contexts_t ctx;
     int ret = 0;
 
-    if(createBuffers(&buf, fileNamesTable, nbFiles)) {
+    if (createBuffers(&buf, fileNamesTable, nbFiles)) {
         DISPLAY("unable to load files\n");
         return 1;
     }
 
-    if(createContexts(&ctx, dictFileName)) {
+    if (createContexts(&ctx, dictFileName)) {
         DISPLAY("unable to load dictionary\n");
         freeBuffers(buf);
         return 2;
     }
 
     DISPLAY("\r%79s\r", "");
-    if(nbFiles == 1) {
+    if (nbFiles == 1) {
         DISPLAY("using %s : \n", fileNamesTable[0]);
     } else {
         DISPLAY("using %d Files : \n", nbFiles);
     }
 
-    if(g_singleRun) {
+    if (g_singleRun) {
         ret = benchOnce(buf, ctx, cLevel);
     } else {
         BMK_benchFullTable(buf, ctx);
@@ -1967,12 +1976,12 @@ int benchFiles(const char** fileNamesTable, int nbFiles, const char* dictFileNam
 *  Local Optimization Functions
 **************************************/
 
-/* One iteration of hill climbing. Specifically, it first tries all 
+/* One iteration of hill climbing. Specifically, it first tries all
  * valid parameter configurations w/ manhattan distance 1 and picks the best one
  * failing that, it progressively tries candidates further and further away (up to #dim + 2)
- * if it finds a candidate exceeding winnerInfo, it will repeat. Otherwise, it will stop the 
- * current stage of hill climbing. 
- * Each iteration of hill climbing proceeds in 2 'phases'. Phase 1 climbs according to 
+ * if it finds a candidate exceeding winnerInfo, it will repeat. Otherwise, it will stop the
+ * current stage of hill climbing.
+ * Each iteration of hill climbing proceeds in 2 'phases'. Phase 1 climbs according to
  * the resultScore function, which is effectively a linear increase in reward until it reaches
  * the constraint-satisfying value, it which point any excess results in only logarithmic reward.
  * This aims to find some constraint-satisfying point.
@@ -1980,14 +1989,15 @@ int benchFiles(const char** fileNamesTable, int nbFiles, const char* dictFileNam
  * all feasible solutions valued over all infeasible solutions.
  */
 
-/* sanitize all params here. 
+/* sanitize all params here.
  * all generation after random should be sanitized. (maybe sanitize random)
  */
-static winnerInfo_t climbOnce(const constraint_t target, 
-                memoTable_t* mtAll, 
+static winnerInfo_t climbOnce(const constraint_t target,
+                memoTable_t* mtAll,
                 const buffers_t buf, const contexts_t ctx,
-                const paramValues_t init) {
-    /* 
+                const paramValues_t init)
+{
+    /*
      * cparam - currently considered 'center'
      * candidate - params to benchmark/results
      * winner - best option found so far.
@@ -2000,11 +2010,9 @@ static winnerInfo_t climbOnce(const constraint_t target,
     winnerInfo = initWinnerInfo(init);
     candidateInfo = winnerInfo;
 
-    {
-        winnerInfo_t bestFeasible1 = initWinnerInfo(cparam);
+    {   winnerInfo_t bestFeasible1 = initWinnerInfo(cparam);
         DEBUGOUTPUT("Climb Part 1\n");
         while(better) {
-
             int offset;
             size_t i, dist;
             const size_t varLen = mtAll[cparam.vals[strt_ind]].varLen;
@@ -2013,12 +2021,12 @@ static winnerInfo_t climbOnce(const constraint_t target,
             cparam = winnerInfo.params;
             candidateInfo.params = cparam;
              /* all dist-1 candidates */
-            for(i = 0; i < varLen; i++) {
-                for(offset = -1; offset <= 1; offset += 2) {
+            for (i = 0; i < varLen; i++) {
+                for (offset = -1; offset <= 1; offset += 2) {
                     CHECKTIME(winnerInfo);
                     candidateInfo.params = cparam;
-                    paramVaryOnce(mtAll[cparam.vals[strt_ind]].varArray[i], offset, &candidateInfo.params); 
-                    
+                    paramVaryOnce(mtAll[cparam.vals[strt_ind]].varArray[i], offset, &candidateInfo.params);
+
                     if(paramValid(candidateInfo.params)) {
                         int res;
                         res = benchMemo(&candidateInfo.result, buf, ctx,
@@ -2033,7 +2041,7 @@ static winnerInfo_t climbOnce(const constraint_t target,
                         }
                     }
                 }
-            }
+            }   /* for (i = 0; i < varLen; i++) */
 
             if(better) {
                 continue;
@@ -2047,27 +2055,28 @@ static winnerInfo_t climbOnce(const constraint_t target,
                     /* param error checking already done here */
                     paramVariation(&candidateInfo.params, mtAll, (U32)dist);
 
-                    res = benchMemo(&candidateInfo.result, buf, ctx, 
-                        sanitizeParams(candidateInfo.params), target, &winnerInfo.result, mtAll, feas);
+                    res = benchMemo(&candidateInfo.result,
+                                buf, ctx,
+                                sanitizeParams(candidateInfo.params), target,
+                                &winnerInfo.result, mtAll, feas);
                     DEBUGOUTPUT("Res: %d\n", res);
-                    if(res == BETTER_RESULT) { /* synonymous with better in this case*/
+                    if (res == BETTER_RESULT) { /* synonymous with better in this case*/
                         winnerInfo = candidateInfo;
                         better = 1;
-                        if(compareResultLT(bestFeasible1.result, winnerInfo.result, target, buf.srcSize)) {
+                        if (compareResultLT(bestFeasible1.result, winnerInfo.result, target, buf.srcSize)) {
                             bestFeasible1 = winnerInfo;
                         }
                         break;
                     }
-
                 }
-                if(better) {
+
+                if (better) {
                     break;
                 }
-            }
-
-            if(!better) { /* infeas -> feas -> stop */ 
-                if(feas) { return winnerInfo; } 
+            }   /* for(dist = 2; dist < varLen + 2; dist++) */
 
+            if (!better) { /* infeas -> feas -> stop */
+                if (feas) return winnerInfo;
                 feas = 1;
                 better = 1;
                 winnerInfo = bestFeasible1; /* note with change, bestFeasible may not necessarily be feasible, but if one has been benchmarked, it will be. */
@@ -2084,17 +2093,17 @@ static winnerInfo_t climbOnce(const constraint_t target,
 
 /* flexible parameters: iterations of failed climbing (or if we do non-random, maybe this is when everything is close to visitied)
    weight more on visit for bad results, less on good results/more on later results / ones with more failures.
-   allocate memoTable here. 
+   allocate memoTable here.
  */
 static winnerInfo_t optimizeFixedStrategy(
-    const buffers_t buf, const contexts_t ctx, 
+    const buffers_t buf, const contexts_t ctx,
     const constraint_t target, paramValues_t paramTarget,
-    const ZSTD_strategy strat, 
+    const ZSTD_strategy strat,
     memoTable_t* memoTableArray, const int tries) {
     int i = 0;
 
     paramValues_t init;
-    winnerInfo_t winnerInfo, candidateInfo; 
+    winnerInfo_t winnerInfo, candidateInfo;
     winnerInfo = initWinnerInfo(emptyParams());
     /* so climb is given the right fixed strategy */
     paramTarget.vals[strt_ind] = strat;
@@ -2104,7 +2113,7 @@ static winnerInfo_t optimizeFixedStrategy(
     init = paramTarget;
 
     for(i = 0; i < tries; i++) {
-        DEBUGOUTPUT("Restart\n"); 
+        DEBUGOUTPUT("Restart\n");
         do { randomConstrainedParams(&init, memoTableArray, strat); } while(redundantParams(init, target, buf.maxBlockSize));
         candidateInfo = climbOnce(target, memoTableArray, buf, ctx, init);
         if(compareResultLT(winnerInfo.result, candidateInfo.result, target, buf.srcSize)) {
@@ -2153,9 +2162,9 @@ static int nextStrategy(const int currentStrategy, const int bestStrategy) {
 
 /* main fn called when using --optimize */
 /* Does strategy selection by benchmarking default compression levels
- * then optimizes by strategy, starting with the best one and moving 
+ * then optimizes by strategy, starting with the best one and moving
  * progressively moving further away by number
- * args: 
+ * args:
  * fileNamesTable - list of files to benchmark
  * nbFiles - length of fileNamesTable
  * dictFileName - name of dictionary file if one, else NULL
@@ -2167,7 +2176,7 @@ static int nextStrategy(const int currentStrategy, const int bestStrategy) {
 static int g_maxTries = 5;
 #define TRY_DECAY 1
 
-static int optimizeForSize(const char* const * const fileNamesTable, const size_t nbFiles, const char* dictFileName, constraint_t target, paramValues_t paramTarget, 
+static int optimizeForSize(const char* const * const fileNamesTable, const size_t nbFiles, const char* dictFileName, constraint_t target, paramValues_t paramTarget,
     const int cLevelOpt, const int cLevelRun, const U32 memoTableLog)
 {
     varInds_t varArray [NUM_PARAMS];
@@ -2194,7 +2203,7 @@ static int optimizeForSize(const char* const * const fileNamesTable, const size_
     if(nbFiles == 1) {
         DISPLAYLEVEL(2, "Loading %s...       \r", fileNamesTable[0]);
     } else {
-        DISPLAYLEVEL(2, "Loading %lu Files...       \r", (unsigned long)nbFiles); 
+        DISPLAYLEVEL(2, "Loading %lu Files...       \r", (unsigned long)nbFiles);
     }
 
     /* sanitize paramTarget */
@@ -2203,14 +2212,14 @@ static int optimizeForSize(const char* const * const fileNamesTable, const size_
 
     allMT = createMemoTableArray(paramTarget, varArray, varLen, memoTableLog);
 
-    if(!allMT) {
+    if (!allMT) {
         DISPLAY("MemoTable Init Error\n");
         ret = 2;
         goto _cleanUp;
     }
 
     /* default strictnesses */
-    if(g_strictness == PARAM_UNSET) {
+    if (g_strictness == PARAM_UNSET) {
         if(g_optmode) {
             g_strictness = 100;
         } else {
@@ -2225,29 +2234,29 @@ static int optimizeForSize(const char* const * const fileNamesTable, const size_
     }
 
     /* use level'ing mode instead of normal target mode */
-    if(g_optmode) {
+    if (g_optmode) {
         winner.params = cParamsToPVals(ZSTD_getCParams(cLevelOpt, buf.maxBlockSize, ctx.dictSize));
         if(BMK_benchParam(&winner.result, buf, ctx, winner.params)) {
             ret = 3;
             goto _cleanUp;
         }
- 
-        g_lvltarget = winner.result; 
+
+        g_lvltarget = winner.result;
         g_lvltarget.cSpeed *= ((double)g_strictness) / 100;
         g_lvltarget.dSpeed *= ((double)g_strictness) / 100;
         g_lvltarget.cSize /= ((double)g_strictness) / 100;
 
-        target.cSpeed = (U32)g_lvltarget.cSpeed;  
-        target.dSpeed = (U32)g_lvltarget.dSpeed; 
+        target.cSpeed = (U32)g_lvltarget.cSpeed;
+        target.dSpeed = (U32)g_lvltarget.dSpeed;
 
         BMK_printWinnerOpt(stdout, cLevelOpt, winner.result, winner.params, target, buf.srcSize);
     }
 
     /* Don't want it to return anything worse than the best known result */
-    if(g_singleRun) {
-        BMK_result_t res;
+    if (g_singleRun) {
+        BMK_benchResult_t res;
         g_params = adjustParams(overwriteParams(cParamsToPVals(ZSTD_getCParams(cLevelRun, buf.maxBlockSize, ctx.dictSize)), g_params), buf.maxBlockSize, ctx.dictSize);
-        if(BMK_benchParam(&res, buf, ctx, g_params)) {
+        if (BMK_benchParam(&res, buf, ctx, g_params)) {
             ret = 45;
             goto _cleanUp;
         }
@@ -2272,8 +2281,7 @@ static int optimizeForSize(const char* const * const fileNamesTable, const size_
     DISPLAYLEVEL(2, "\n");
     findClockGranularity();
 
-    {   
-        paramValues_t CParams;
+    {   paramValues_t CParams;
 
         /* find best solution from default params */
         {
@@ -2281,7 +2289,7 @@ static int optimizeForSize(const char* const * const fileNamesTable, const size_
             const int maxSeeds = g_noSeed ? 1 : ZSTD_maxCLevel();
             DEBUGOUTPUT("Strategy Selection\n");
             if(paramTarget.vals[strt_ind] == PARAM_UNSET) {
-                BMK_result_t candidate;
+                BMK_benchResult_t candidate;
                 int i;
                 for (i=1; i<=maxSeeds; i++) {
                     int ec;
@@ -2305,13 +2313,13 @@ static int optimizeForSize(const char* const * const fileNamesTable, const size_
 
         DEBUGOUTPUT("Real Opt\n");
         /* start 'real' optimization */
-        {   
+        {
             int bestStrategy = (int)winner.params.vals[strt_ind];
             if(paramTarget.vals[strt_ind] == PARAM_UNSET) {
                 int st = bestStrategy;
                 int tries = g_maxTries;
 
-                { 
+                {
                     /* one iterations of hill climbing with the level-defined parameters. */
                     winnerInfo_t w1 = climbOnce(target, allMT, buf, ctx, winner.params);
                     if(compareResultLT(winner.result, w1.result, target, buf.srcSize)) {
@@ -2323,7 +2331,7 @@ static int optimizeForSize(const char* const * const fileNamesTable, const size_
                 while(st && tries > 0) {
                     winnerInfo_t wc;
                     DEBUGOUTPUT("StrategySwitch: %s\n", g_stratName[st]);
-                    
+
                     wc = optimizeFixedStrategy(buf, ctx, target, paramBase, st, allMT, tries);
 
                     if(compareResultLT(winner.result, wc.result, target, buf.srcSize)) {
@@ -2349,7 +2357,7 @@ static int optimizeForSize(const char* const * const fileNamesTable, const size_
             goto _cleanUp;
         }
         /* end summary */
-_displayCleanUp: 
+_displayCleanUp:
         if(g_displayLevel >= 0) { BMK_displayOneResult(stdout, winner, buf.srcSize); }
         BMK_translateAdvancedParams(stdout, winner.params);
         DISPLAYLEVEL(1, "grillParams size - optimizer completed \n");
@@ -2427,7 +2435,7 @@ static double readDoubleFromChar(const char** stringPtr)
     }
     (*stringPtr)++;
     while ((**stringPtr >='0') && (**stringPtr <='9')) {
-        result += (double)(**stringPtr - '0') / divide, divide *= 10, (*stringPtr)++ ; 
+        result += (double)(**stringPtr - '0') / divide, divide *= 10, (*stringPtr)++ ;
     }
     return result;
 }
@@ -2503,7 +2511,7 @@ int main(int argc, const char** argv)
     int seperateFiles = 0;
     double compressibility = COMPRESSIBILITY_DEFAULT;
     U32 memoTableLog = PARAM_UNSET;
-    constraint_t target = { 0, 0, (U32)-1 }; 
+    constraint_t target = { 0, 0, (U32)-1 };
 
     paramValues_t paramTarget = emptyParams();
     g_params = emptyParams();
@@ -2522,7 +2530,7 @@ int main(int argc, const char** argv)
             for ( ; ;) {
                 if(parse_params(&argument, &paramTarget)) { if(argument[0] == ',') { argument++; continue; } else break; }
                 PARSE_SUB_ARGS("compressionSpeed=" ,  "cSpeed=", target.cSpeed);
-                PARSE_SUB_ARGS("decompressionSpeed=", "dSpeed=", target.dSpeed); 
+                PARSE_SUB_ARGS("decompressionSpeed=", "dSpeed=", target.dSpeed);
                 PARSE_SUB_ARGS("compressionMemory=" , "cMem=", target.cMem);
                 PARSE_SUB_ARGS("strict=", "stc=", g_strictness);
                 PARSE_SUB_ARGS("maxTries=", "tries=", g_maxTries);
@@ -2701,7 +2709,7 @@ int main(int argc, const char** argv)
 
                 /* load dictionary file (only applicable for optimizer rn) */
                 case 'D':
-                    if(i == argc - 1) { /* last argument, return error. */ 
+                    if(i == argc - 1) { /* last argument, return error. */
                         DISPLAY("Dictionary file expected but not given : %d\n", i);
                         return 1;
                     } else {
@@ -2749,7 +2757,7 @@ int main(int argc, const char** argv)
             } else {
                 result = benchFiles(argv+filenamesStart, argc-filenamesStart, dictFileName, cLevelRun);
             }
-        }   
+        }
     }
 
     if (main_pause) { int unused; printf("press enter...\n"); unused = getchar(); (void)unused; }
diff --git a/tests/playTests.sh b/tests/playTests.sh
index 0a1f96c02..19b502999 100755
--- a/tests/playTests.sh
+++ b/tests/playTests.sh
@@ -48,6 +48,10 @@ fileRoundTripTest() {
     $DIFF -q tmp.md5.1 tmp.md5.2
 }
 
+truncateLastByte() {
+	dd bs=1 count=$(($(wc -c < "$1") - 1)) if="$1" status=none
+}
+
 UNAME=$(uname)
 
 isTerminal=false
@@ -427,7 +431,7 @@ $ECHO "- Create second (different) dictionary"
 $ZSTD --train-cover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC
 $ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
 $ECHO "- Create dictionary with short dictID"
-$ZSTD --train-cover=k=46,d=8 *.c ../programs/*.c --dictID=1 -o tmpDict1
+$ZSTD --train-cover=k=46,d=8,split=80 *.c ../programs/*.c --dictID=1 -o tmpDict1
 cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
 $ECHO "- Create dictionary with size limit"
 $ZSTD --train-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
@@ -444,6 +448,47 @@ $ZSTD --train-cover *.c ../programs/*.c
 test -f dictionary
 rm tmp* dictionary
 
+
+$ECHO "\n===>  fastCover dictionary builder : advanced options "
+
+TESTFILE=../programs/zstdcli.c
+./datagen > tmpDict
+$ECHO "- Create first dictionary"
+$ZSTD --train-fastcover=k=46,d=8,f=15,split=80 *.c ../programs/*.c -o tmpDict
+cp $TESTFILE tmp
+$ZSTD -f tmp -D tmpDict
+$ZSTD -d tmp.zst -D tmpDict -fo result
+$DIFF $TESTFILE result
+$ECHO "- Create second (different) dictionary"
+$ZSTD --train-fastcover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC
+$ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
+$ECHO "- Create dictionary with short dictID"
+$ZSTD --train-fastcover=k=46,d=8,f=15,split=80 *.c ../programs/*.c --dictID=1 -o tmpDict1
+cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
+$ECHO "- Create dictionary with size limit"
+$ZSTD --train-fastcover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
+$ECHO "- Compare size of dictionary from 90% training samples with 80% training samples"
+$ZSTD --train-fastcover=split=90 -r *.c ../programs/*.c
+$ZSTD --train-fastcover=split=80 -r *.c ../programs/*.c
+$ECHO "- Create dictionary using all samples for both training and testing"
+$ZSTD --train-fastcover=split=100 -r *.c ../programs/*.c
+$ECHO "- Create dictionary using f=16"
+$ZSTD --train-fastcover=f=16 -r *.c ../programs/*.c
+$ECHO "- Create dictionary using accel=2"
+$ZSTD --train-fastcover=accel=2 -r *.c ../programs/*.c
+$ECHO "- Create dictionary using accel=10"
+$ZSTD --train-fastcover=accel=10 -r *.c ../programs/*.c
+$ECHO "- Create dictionary with multithreading"
+$ZSTD --train-fastcover -T4 -r *.c ../programs/*.c
+$ECHO "- Test -o before --train-fastcover"
+rm -f tmpDict dictionary
+$ZSTD -o tmpDict --train-fastcover *.c ../programs/*.c
+test -f tmpDict
+$ZSTD --train-fastcover *.c ../programs/*.c
+test -f dictionary
+rm tmp* dictionary
+
+
 $ECHO "\n===>  legacy dictionary builder "
 
 TESTFILE=../programs/zstdcli.c
@@ -551,7 +596,7 @@ if [ $GZIPMODE -eq 1 ]; then
     $ZSTD -f --format=gzip tmp
     $ZSTD -f tmp
     cat tmp.gz tmp.zst tmp.gz tmp.zst | $ZSTD -d -f -o tmp
-    head -c -1 tmp.gz | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
+    truncateLastByte tmp.gz | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
     rm tmp*
 else
     $ECHO "gzip mode not supported"
@@ -618,8 +663,8 @@ if [ $LZMAMODE -eq 1 ]; then
     $ZSTD -f --format=lzma tmp
     $ZSTD -f tmp
     cat tmp.xz tmp.lzma tmp.zst tmp.lzma tmp.xz tmp.zst | $ZSTD -d -f -o tmp
-    head -c -1 tmp.xz | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
-    head -c -1 tmp.lzma | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
+    truncateLastByte tmp.xz | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
+    truncateLastByte tmp.lzma | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
     rm tmp*
 else
     $ECHO "xz mode not supported"
@@ -655,7 +700,7 @@ if [ $LZ4MODE -eq 1 ]; then
     $ZSTD -f --format=lz4 tmp
     $ZSTD -f tmp
     cat tmp.lz4 tmp.zst tmp.lz4 tmp.zst | $ZSTD -d -f -o tmp
-    head -c -1 tmp.lz4 | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
+    truncateLastByte tmp.lz4 | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
     rm tmp*
 else
     $ECHO "lz4 mode not supported"
diff --git a/tests/roundTripCrash.c b/tests/roundTripCrash.c
index 7d937fcee..90afcd4b2 100644
--- a/tests/roundTripCrash.c
+++ b/tests/roundTripCrash.c
@@ -212,7 +212,7 @@ static void loadFile(void* buffer, const char* fileName, size_t fileSize)
 static void fileCheck(const char* fileName, int testCCtxParams)
 {
     size_t const fileSize = getFileSize(fileName);
-    void* buffer = malloc(fileSize);
+    void* const buffer = malloc(fileSize + !fileSize /* avoid 0 */);
     if (!buffer) {
         fprintf(stderr, "not enough memory \n");
         exit(4);
diff --git a/tests/symbols.c b/tests/symbols.c
index c0bed2e5d..b37082131 100644
--- a/tests/symbols.c
+++ b/tests/symbols.c
@@ -144,6 +144,8 @@ static const void *symbols[] = {
 /* zdict.h: advanced functions */
   &ZDICT_trainFromBuffer_cover,
   &ZDICT_optimizeTrainFromBuffer_cover,
+  &ZDICT_trainFromBuffer_fastCover,
+  &ZDICT_optimizeTrainFromBuffer_fastCover,
   &ZDICT_finalizeDictionary,
   &ZDICT_trainFromBuffer_legacy,
   &ZDICT_addEntropyTablesFromBuffer,
diff --git a/tests/zstreamtest.c b/tests/zstreamtest.c
index 3d61394a7..96136a625 100644
--- a/tests/zstreamtest.c
+++ b/tests/zstreamtest.c
@@ -135,34 +135,34 @@ typedef struct {
     size_t filled;
 } buffer_t;
 
-static const buffer_t g_nullBuffer = { NULL, 0 , 0 };
+static const buffer_t kBuffNull = { NULL, 0 , 0 };
+
+static void FUZ_freeDictionary(buffer_t dict)
+{
+    free(dict.start);
+}
 
 static buffer_t FUZ_createDictionary(const void* src, size_t srcSize, size_t blockSize, size_t requestedDictSize)
 {
-    buffer_t dict = { NULL, 0, 0 };
+    buffer_t dict = kBuffNull;
     size_t const nbBlocks = (srcSize + (blockSize-1)) / blockSize;
-    size_t* const blockSizes = (size_t*) malloc(nbBlocks * sizeof(size_t));
-    if (!blockSizes) return dict;
+    size_t* const blockSizes = (size_t*)malloc(nbBlocks * sizeof(size_t));
+    if (!blockSizes) return kBuffNull;
     dict.start = malloc(requestedDictSize);
-    if (!dict.start) { free(blockSizes); return dict; }
+    if (!dict.start) { free(blockSizes); return kBuffNull; }
     {   size_t nb;
         for (nb=0; nb<nbBlocks-1; nb++) blockSizes[nb] = blockSize;
         blockSizes[nbBlocks-1] = srcSize - (blockSize * (nbBlocks-1));
     }
     {   size_t const dictSize = ZDICT_trainFromBuffer(dict.start, requestedDictSize, src, blockSizes, (unsigned)nbBlocks);
         free(blockSizes);
-        if (ZDICT_isError(dictSize)) { free(dict.start); return g_nullBuffer; }
+        if (ZDICT_isError(dictSize)) { FUZ_freeDictionary(dict); return kBuffNull; }
         dict.size = requestedDictSize;
         dict.filled = dictSize;
-        return dict;   /* how to return dictSize ? */
+        return dict;
     }
 }
 
-static void FUZ_freeDictionary(buffer_t dict)
-{
-    free(dict.start);
-}
-
 /* Round trips data and updates xxh with the decompressed data produced */
 static size_t SEQ_roundTrip(ZSTD_CCtx* cctx, ZSTD_DCtx* dctx,
                             XXH64_state_t* xxh, void* data, size_t size,
@@ -276,7 +276,7 @@ static int basicUnitTests(U32 seed, double compressibility)
 
     ZSTD_inBuffer  inBuff, inBuff2;
     ZSTD_outBuffer outBuff;
-    buffer_t dictionary = g_nullBuffer;
+    buffer_t dictionary = kBuffNull;
     size_t const dictSize = 128 KB;
     unsigned dictID = 0;
 
@@ -600,7 +600,6 @@ static int basicUnitTests(U32 seed, double compressibility)
         size_t const initError = ZSTD_initCStream_usingCDict(zc, cdict);
         DISPLAYLEVEL(5, "ZSTD_initCStream_usingCDict result : %u ", (U32)initError);
         if (ZSTD_isError(initError)) goto _output_error;
-        cSize = 0;
         outBuff.dst = compressedBuffer;
         outBuff.size = compressedBufferSize;
         outBuff.pos = 0;
@@ -718,7 +717,6 @@ static int basicUnitTests(U32 seed, double compressibility)
         ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictionary.start, dictionary.filled, ZSTD_dlm_byRef, ZSTD_dct_auto, cParams, ZSTD_defaultCMem);
         size_t const initError = ZSTD_initCStream_usingCDict_advanced(zc, cdict, fParams, CNBufferSize);
         if (ZSTD_isError(initError)) goto _output_error;
-        cSize = 0;
         outBuff.dst = compressedBuffer;
         outBuff.size = compressedBufferSize;
         outBuff.pos = 0;
@@ -1022,6 +1020,59 @@ static int basicUnitTests(U32 seed, double compressibility)
     }
     DISPLAYLEVEL(3, "OK \n");
 
+    DISPLAYLEVEL(3, "test%3i : dictionary + uncompressible block + reusing tables checks offset table validity: ", testNb++);
+    {   ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(
+            dictionary.start, dictionary.filled,
+            ZSTD_dlm_byRef, ZSTD_dct_fullDict,
+            ZSTD_getCParams(3, 0, dictionary.filled),
+            ZSTD_defaultCMem);
+        const size_t inbufsize = 2 * 128 * 1024; /* 2 blocks */
+        const size_t outbufsize = ZSTD_compressBound(inbufsize);
+        size_t inbufpos = 0;
+        size_t cursegmentlen;
+        BYTE *inbuf = (BYTE *)malloc(inbufsize);
+        BYTE *outbuf = (BYTE *)malloc(outbufsize);
+        BYTE *checkbuf = (BYTE *)malloc(inbufsize);
+        size_t ret;
+
+        CHECK(cdict == NULL, "failed to alloc cdict");
+        CHECK(inbuf == NULL, "failed to alloc input buffer");
+
+        /* first block is uncompressible */
+        cursegmentlen = 128 * 1024;
+        RDG_genBuffer(inbuf + inbufpos, cursegmentlen, 0., 0., seed);
+        inbufpos += cursegmentlen;
+
+        /* second block is compressible */
+        cursegmentlen = 128 * 1024 - 256;
+        RDG_genBuffer(inbuf + inbufpos, cursegmentlen, 0.05, 0., seed);
+        inbufpos += cursegmentlen;
+
+        /* and includes a very long backref */
+        cursegmentlen = 128;
+        memcpy(inbuf + inbufpos, dictionary.start + 256, cursegmentlen);
+        inbufpos += cursegmentlen;
+
+        /* and includes a very long backref */
+        cursegmentlen = 128;
+        memcpy(inbuf + inbufpos, dictionary.start + 128, cursegmentlen);
+        inbufpos += cursegmentlen;
+
+        ret = ZSTD_compress_usingCDict(zc, outbuf, outbufsize, inbuf, inbufpos, cdict);
+        CHECK_Z(ret);
+
+        ret = ZSTD_decompress_usingDict(zd, checkbuf, inbufsize, outbuf, ret, dictionary.start, dictionary.filled);
+        CHECK_Z(ret);
+
+        CHECK(memcmp(inbuf, checkbuf, inbufpos), "start and finish buffers don't match");
+
+        ZSTD_freeCDict(cdict);
+        free(inbuf);
+        free(outbuf);
+        free(checkbuf);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
 _end:
     FUZ_freeDictionary(dictionary);
     ZSTD_freeCStream(zc);
diff --git a/zlibWrapper/examples/zwrapbench.c b/zlibWrapper/examples/zwrapbench.c
index a4dfbb6e8..d2d6073f9 100644
--- a/zlibWrapper/examples/zwrapbench.c
+++ b/zlibWrapper/examples/zwrapbench.c
@@ -573,10 +573,10 @@ static size_t BMK_findMaxMem(U64 requiredMem)
     do {
         testmem = (BYTE*)malloc((size_t)requiredMem);
         requiredMem -= step;
-    } while (!testmem);
+    } while (!testmem && requiredMem);   /* do not allocate zero bytes */
 
     free(testmem);
-    return (size_t)(requiredMem);
+    return (size_t)(requiredMem+1);  /* avoid zero */
 }
 
 static void BMK_benchCLevel(void* srcBuffer, size_t benchedSize,
@@ -734,7 +734,7 @@ static void BMK_benchFileTable(const char** fileNamesTable, unsigned nbFiles,
     if ((U64)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
     if (benchedSize < totalSizeToLoad)
         DISPLAY("Not enough memory; testing %u MB only...\n", (U32)(benchedSize >> 20));
-    srcBuffer = malloc(benchedSize);
+    srcBuffer = malloc(benchedSize + !benchedSize);
     if (!srcBuffer) EXM_THROW(12, "not enough memory");
 
     /* Load input buffer */
diff --git a/zlibWrapper/gzlib.c b/zlibWrapper/gzlib.c
index 8235cff4f..3070dd8b4 100644
--- a/zlibWrapper/gzlib.c
+++ b/zlibWrapper/gzlib.c
@@ -111,7 +111,7 @@ local gzFile gz_open(path, fd, mode)
         return NULL;
 
     /* allocate gzFile structure to return */
-    state = (gz_statep)(gz_state*)malloc(sizeof(gz_state));
+    state.state = (gz_state*)malloc(sizeof(gz_state));
     if (state.state == NULL)
         return NULL;
     state.state->size = 0;            /* no buffers allocated yet */
@@ -266,7 +266,7 @@ local gzFile gz_open(path, fd, mode)
     gz_reset(state);
 
     /* return stream */
-    return (gzFile)state.file;
+    return state.file;
 }
 
 /* -- see zlib.h -- */
diff --git a/zlibWrapper/gzwrite.c b/zlibWrapper/gzwrite.c
index d1250b900..21d5f8472 100644
--- a/zlibWrapper/gzwrite.c
+++ b/zlibWrapper/gzwrite.c
@@ -6,6 +6,8 @@
  * For conditions of distribution and use, see http://www.zlib.net/zlib_license.html
  */
 
+#include <assert.h>
+
 #include "gzguts.h"
 
 /* Local functions */
@@ -24,7 +26,7 @@ local int gz_init(state)
     z_streamp strm = &(state.state->strm);
 
     /* allocate input buffer (double size for gzprintf) */
-    state.state->in = (unsigned char *)malloc(state.state->want << 1);
+    state.state->in = (unsigned char*)malloc(state.state->want << 1);
     if (state.state->in == NULL) {
         gz_error(state, Z_MEM_ERROR, "out of memory");
         return -1;
@@ -33,7 +35,7 @@ local int gz_init(state)
     /* only need output buffer and deflate state if compressing */
     if (!state.state->direct) {
         /* allocate output buffer */
-        state.state->out = (unsigned char *)malloc(state.state->want);
+        state.state->out = (unsigned char*)malloc(state.state->want);
         if (state.state->out == NULL) {
             free(state.state->in);
             gz_error(state, Z_MEM_ERROR, "out of memory");
@@ -284,6 +286,7 @@ z_size_t ZEXPORT gzfwrite(buf, size, nitems, file)
     gz_statep state;
 
     /* get internal structure */
+    assert(size != 0);
     if (file == NULL)
         return 0;
     state = (gz_statep)file;
@@ -294,7 +297,7 @@ z_size_t ZEXPORT gzfwrite(buf, size, nitems, file)
 
     /* compute bytes to read -- error on overflow */
     len = nitems * size;
-    if (size && len / size != nitems) {
+    if (size && (len / size != nitems)) {
         gz_error(state, Z_STREAM_ERROR, "request does not fit in a size_t");
         return 0;
     }