diff --git a/.gitattributes b/.gitattributes index 9eb12c0ef62e..6212bd405b4a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -19,6 +19,3 @@ # Windows *.bat text eol=crlf *.cmd text eol=crlf - -# .travis.yml merging -.travis.yml merge=ours diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000000..0f7ad8bfc173 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,5 @@ +# Code of Conduct + +Facebook has adopted a Code of Conduct that we expect project participants to adhere to. +Please read the [full text](https://code.fb.com/codeofconduct/) +so that you can understand what actions will and will not be tolerated. diff --git a/Makefile b/Makefile index 320fc68071d4..c63db80e9e0c 100644 --- a/Makefile +++ b/Makefile @@ -23,20 +23,19 @@ else EXT = endif +## default: Build lib-release and zstd-release .PHONY: default default: lib-release zstd-release .PHONY: all -all: | allmost examples manual contrib +all: allmost examples manual contrib .PHONY: allmost -allmost: allzstd - $(MAKE) -C $(ZWRAPDIR) all +allmost: allzstd zlibwrapper -#skip zwrapper, can't build that on alternate architectures without the proper zlib installed +# skip zwrapper, can't build that on alternate architectures without the proper zlib installed .PHONY: allzstd -allzstd: - $(MAKE) -C $(ZSTDDIR) all +allzstd: lib $(MAKE) -C $(PRGDIR) all $(MAKE) -C $(TESTDIR) all @@ -45,58 +44,62 @@ all32: $(MAKE) -C $(PRGDIR) zstd32 $(MAKE) -C $(TESTDIR) all32 -.PHONY: lib -lib: +.PHONY: lib lib-release libzstd.a +lib lib-release : @$(MAKE) -C $(ZSTDDIR) $@ -.PHONY: lib-release -lib-release: - @$(MAKE) -C $(ZSTDDIR) - -.PHONY: zstd -zstd: +.PHONY: zstd zstd-release +zstd zstd-release: @$(MAKE) -C $(PRGDIR) $@ cp $(PRGDIR)/zstd$(EXT) . -.PHONY: zstd-release -zstd-release: - @$(MAKE) -C $(PRGDIR) - cp $(PRGDIR)/zstd$(EXT) . 
- .PHONY: zstdmt zstdmt: @$(MAKE) -C $(PRGDIR) $@ cp $(PRGDIR)/zstd$(EXT) ./zstdmt$(EXT) .PHONY: zlibwrapper -zlibwrapper: - $(MAKE) -C $(ZWRAPDIR) test +zlibwrapper: lib + $(MAKE) -C $(ZWRAPDIR) all +## test: run long-duration tests .PHONY: test +test: MOREFLAGS += -g -DDEBUGLEVEL=1 -Werror test: - $(MAKE) -C $(PRGDIR) allVariants MOREFLAGS+="-g -DZSTD_DEBUG=1" + MOREFLAGS="$(MOREFLAGS)" $(MAKE) -j -C $(PRGDIR) allVariants $(MAKE) -C $(TESTDIR) $@ +## shortest: same as `make check` .PHONY: shortest shortest: $(MAKE) -C $(TESTDIR) $@ +## check: run basic tests for `zstd` cli .PHONY: check check: shortest +## examples: build all examples in `/examples` directory .PHONY: examples -examples: +examples: lib CPPFLAGS=-I../lib LDFLAGS=-L../lib $(MAKE) -C examples/ all +## manual: generate API documentation in html format .PHONY: manual manual: $(MAKE) -C contrib/gen_html $@ +## man: generate man page +.PHONY: man +man: + $(MAKE) -C programs $@ + +## contrib: build all supported projects in `/contrib` directory .PHONY: contrib contrib: lib $(MAKE) -C contrib/pzstd all $(MAKE) -C contrib/seekable_format/examples all $(MAKE) -C contrib/adaptive-compression all + $(MAKE) -C contrib/largeNbDicts all .PHONY: cleanTabs cleanTabs: @@ -113,21 +116,39 @@ clean: @$(MAKE) -C contrib/pzstd $@ > $(VOID) @$(MAKE) -C contrib/seekable_format/examples $@ > $(VOID) @$(MAKE) -C contrib/adaptive-compression $@ > $(VOID) + @$(MAKE) -C contrib/largeNbDicts $@ > $(VOID) @$(RM) zstd$(EXT) zstdmt$(EXT) tmp* @$(RM) -r lz4 @echo Cleaning completed #------------------------------------------------------------------------------ -# make install is validated only for Linux, OSX, Hurd and some BSD targets +# make install is validated only for Linux, macOS, Hurd and some BSD targets #------------------------------------------------------------------------------ -ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU FreeBSD DragonFly NetBSD MSYS_NT)) +ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD DragonFly NetBSD MSYS_NT Haiku)) HOST_OS = POSIX -CMAKE_PARAMS = -DZSTD_BUILD_CONTRIB:BOOL=ON -DZSTD_BUILD_STATIC:BOOL=ON -DZSTD_BUILD_TESTS:BOOL=ON -DZSTD_ZLIB_SUPPORT:BOOL=ON -DZSTD_LZMA_SUPPORT:BOOL=ON +CMAKE_PARAMS = -DZSTD_BUILD_CONTRIB:BOOL=ON -DZSTD_BUILD_STATIC:BOOL=ON -DZSTD_BUILD_TESTS:BOOL=ON -DZSTD_ZLIB_SUPPORT:BOOL=ON -DZSTD_LZMA_SUPPORT:BOOL=ON -DCMAKE_BUILD_TYPE=Release +EGREP = egrep --color=never + +# Print a two column output of targets and their description. To add a target description, put a +# comment in the Makefile with the format "## : ". 
For example: +# +## list: Print all targets and their descriptions (if provided) .PHONY: list list: - @$(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null | awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' | sort | egrep -v -e '^[^[:alnum:]]' -e '^$@$$' | xargs + @TARGETS=$$($(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null \ + | awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' \ + | $(EGREP) -v -e '^[^[:alnum:]]' | sort); \ + { \ + printf "Target Name\tDescription\n"; \ + printf "%0.s-" {1..16}; printf "\t"; printf "%0.s-" {1..40}; printf "\n"; \ + for target in $$TARGETS; do \ + line=$$($(EGREP) "^##[[:space:]]+$$target:" $(lastword $(MAKEFILE_LIST))); \ + description=$$(echo $$line | awk '{i=index($$0,":"); print substr($$0,i+1)}' | xargs); \ + printf "$$target\t$$description\n"; \ + done \ + } | column -t -s $$'\t' .PHONY: install clangtest armtest usan asan uasan install: @@ -183,6 +204,7 @@ armfuzz: clean CC=arm-linux-gnueabi-gcc QEMU_SYS=qemu-arm-static MOREFLAGS="-static" FUZZER_FLAGS=--no-big-tests $(MAKE) -C $(TESTDIR) fuzztest aarch64fuzz: clean + ld -v CC=aarch64-linux-gnu-gcc QEMU_SYS=qemu-aarch64-static MOREFLAGS="-static" FUZZER_FLAGS=--no-big-tests $(MAKE) -C $(TESTDIR) fuzztest ppcfuzz: clean @@ -206,7 +228,7 @@ gcc6test: clean clangtest: clean clang -v - $(MAKE) all CXX=clang-++ CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion -Wdocumentation" + $(MAKE) all CXX=clang++ CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion -Wdocumentation" armtest: clean $(MAKE) -C $(TESTDIR) datagen # use native, faster @@ -295,6 +317,9 @@ gcc6install: apt-add-repo gcc7install: apt-add-repo APT_PACKAGES="libc6-dev-i386 gcc-multilib gcc-7 gcc-7-multilib" $(MAKE) apt-install +gcc8install: apt-add-repo + APT_PACKAGES="libc6-dev-i386 gcc-multilib gcc-8 gcc-8-multilib" $(MAKE) apt-install + gpp6install: apt-add-repo APT_PACKAGES="libc6-dev-i386 g++-multilib gcc-6 g++-6 g++-6-multilib" $(MAKE) apt-install @@ -326,23 +351,23 @@ cmakebuild: c90build: clean $(CC) -v - CFLAGS="-std=c90" $(MAKE) allmost # will fail, due to missing support for `long long` + CFLAGS="-std=c90 -Werror" $(MAKE) allmost # will fail, due to missing support for `long long` gnu90build: clean $(CC) -v - CFLAGS="-std=gnu90" $(MAKE) allmost + CFLAGS="-std=gnu90 -Werror" $(MAKE) allmost c99build: clean $(CC) -v - CFLAGS="-std=c99" $(MAKE) allmost + CFLAGS="-std=c99 -Werror" $(MAKE) allmost gnu99build: clean $(CC) -v - CFLAGS="-std=gnu99" $(MAKE) allmost + CFLAGS="-std=gnu99 -Werror" $(MAKE) allmost c11build: clean $(CC) -v - CFLAGS="-std=c11" $(MAKE) allmost + CFLAGS="-std=c11 -Werror" $(MAKE) allmost bmix64build: clean $(CC) -v @@ -356,7 +381,10 @@ bmi32build: clean $(CC) -v CFLAGS="-O3 -mbmi -m32 -Werror" $(MAKE) -C $(TESTDIR) test -staticAnalyze: clean +# static analyzer test uses clang's scan-build +# does not analyze zlibWrapper, due to detected issues in zlib source code +staticAnalyze: SCANBUILD ?= scan-build +staticAnalyze: $(CC) -v - CPPFLAGS=-g scan-build --status-bugs -v $(MAKE) all + CC=$(CC) CPPFLAGS=-g $(SCANBUILD) --status-bugs -v $(MAKE) allzstd examples contrib endif diff --git a/NEWS b/NEWS index 9e903e6bcb3b..637bd93038e7 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,39 @@ +v1.3.7 +perf: slightly better decompression speed on clang (depending on hardware target) +fix : performance of dictionary compression for small input < 4 KB at levels 9 and 10 +build: no longer build backtrace by default in 
release mode; restrict further automatic mode +build: control backtrace support through build macro BACKTRACE +misc: added man pages for zstdless and zstdgrep, by @samrussell + +v1.3.6 +perf: much faster dictionary builder, by @jenniferliu +perf: faster dictionary compression on small data when using multiple contexts, by @felixhandte +perf: faster dictionary decompression when using a very large number of dictionaries simultaneously +cli : fix : does no longer overwrite destination when source does not exist (#1082) +cli : new command --adapt, for automatic compression level adaptation +api : fix : block api can be streamed with > 4 GB, reported by @catid +api : reduced ZSTD_DDict size by 2 KB +api : minimum negative compression level is defined, and can be queried using ZSTD_minCLevel(). +build: support Haiku target, by @korli +build: Read Legacy format is limited to v0.5+ by default. Can be changed at compile time with macro ZSTD_LEGACY_SUPPORT. +doc : zstd_compression_format.md updated to match wording in IETF RFC 8478 +misc: tests/paramgrill, a parameter optimizer, by @GeorgeLu97 + +v1.3.5 +perf: much faster dictionary compression, by @felixhandte +perf: small quality improvement for dictionary generation, by @terrelln +perf: slightly improved high compression levels (notably level 19) +mem : automatic memory release for long duration contexts +cli : fix : overlapLog can be manually set +cli : fix : decoding invalid lz4 frames +api : fix : performance degradation for dictionary compression when using advanced API, by @terrelln +api : change : clarify ZSTD_CCtx_reset() vs ZSTD_CCtx_resetParameters(), by @terrelln +build: select custom libzstd scope through control macros, by @GeorgeLu97 +build: OpenBSD patch, by @bket +build: make and make all are compatible with -j +doc : clarify zstd_compression_format.md, updated for IETF RFC process +misc: pzstd compatible with reproducible compilation, by @lamby + v1.3.4 perf: faster speed (especially decoding speed) on recent cpus (haswell+) perf: much better performance associating --long with multi-threading, by @terrelln diff --git a/README.md b/README.md index 58c9ab02ce9b..dc99dc0fd30a 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ __Zstandard__, or `zstd` as short version, is a fast lossless compression algori targeting real-time compression scenarios at zlib-level and better compression ratios. It's backed by a very fast entropy stage, provided by [Huff0 and FSE library](https://github.com/Cyan4973/FiniteStateEntropy). -The project is provided as an open-source BSD-licensed **C** library, +The project is provided as an open-source dual [BSD](LICENSE) and [GPLv2](COPYING) licensed **C** library, and a command line utility producing and decoding `.zst`, `.gz`, `.xz` and `.lz4` files. Should your project require another programming language, a list of known ports and bindings is provided on [Zstandard homepage](http://www.zstd.net/#other-languages). @@ -121,6 +121,8 @@ A `cmake` project generator is provided within `build/cmake`. It can generate Makefiles or other build scripts to create `zstd` binary, and `libzstd` dynamic and static libraries. +By default, `CMAKE_BUILD_TYPE` is set to `Release`. + #### Meson A Meson project is provided within `contrib/meson`. 
diff --git a/TESTING.md b/TESTING.md index 1fa5fe8c2d69..551981b14053 100644 --- a/TESTING.md +++ b/TESTING.md @@ -41,4 +41,4 @@ They consist of the following tests: - `pzstd` with asan and tsan, as well as in 32-bits mode - Testing `zstd` with legacy mode off - Testing `zbuff` (old streaming API) -- Entire test suite and make install on OS X +- Entire test suite and make install on macOS diff --git a/appveyor.yml b/appveyor.yml index 742f612069dc..2b674ce3ca1c 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -181,15 +181,15 @@ - COMPILER: "gcc" HOST: "mingw" PLATFORM: "x64" - SCRIPT: "make allzstd" + SCRIPT: "CPPFLAGS=-DDEBUGLEVEL=2 CFLAGS=-Werror make -j allzstd DEBUGLEVEL=2" - COMPILER: "gcc" HOST: "mingw" PLATFORM: "x86" - SCRIPT: "make allzstd" + SCRIPT: "CFLAGS=-Werror make -j allzstd" - COMPILER: "clang" HOST: "mingw" PLATFORM: "x64" - SCRIPT: "MOREFLAGS='--target=x86_64-w64-mingw32 -Werror -Wconversion -Wno-sign-conversion' make allzstd" + SCRIPT: "CFLAGS='--target=x86_64-w64-mingw32 -Werror -Wconversion -Wno-sign-conversion' make -j allzstd" - COMPILER: "visual" HOST: "visual" diff --git a/circle.yml b/circle.yml deleted file mode 100644 index ed50d59e5d9b..000000000000 --- a/circle.yml +++ /dev/null @@ -1,63 +0,0 @@ -dependencies: - override: - - sudo dpkg --add-architecture i386 - - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test; sudo apt-get -y -qq update - - sudo apt-get -y install gcc-powerpc-linux-gnu gcc-arm-linux-gnueabi libc6-dev-armel-cross gcc-aarch64-linux-gnu libc6-dev-arm64-cross - -test: - override: - - ? | - if [[ "$CIRCLE_NODE_INDEX" == "0" ]] ; then cc -v; CFLAGS="-O0 -Werror" make all && make clean; fi && - if [[ "$CIRCLE_NODE_TOTAL" < "2" ]] || [[ "$CIRCLE_NODE_INDEX" == "1" ]]; then make gnu90build && make clean; fi - : - parallel: true - - ? | - if [[ "$CIRCLE_NODE_INDEX" == "0" ]] ; then make c99build && make clean; fi && - if [[ "$CIRCLE_NODE_TOTAL" < "2" ]] || [[ "$CIRCLE_NODE_INDEX" == "1" ]]; then make gnu99build && make clean; fi - : - parallel: true - - ? | - if [[ "$CIRCLE_NODE_INDEX" == "0" ]] ; then make c11build && make clean; fi && - if [[ "$CIRCLE_NODE_TOTAL" < "2" ]] || [[ "$CIRCLE_NODE_INDEX" == "1" ]]; then make ppc64build && make clean; fi - : - parallel: true - - ? | - if [[ "$CIRCLE_NODE_INDEX" == "0" ]] ; then make aarch64build && make clean; fi && - if [[ "$CIRCLE_NODE_TOTAL" < "2" ]] || [[ "$CIRCLE_NODE_INDEX" == "1" ]]; then make ppcbuild && make clean; fi - : - parallel: true - - ? | - if [[ "$CIRCLE_NODE_INDEX" == "0" ]] ; then make -j regressiontest && make clean; fi && - if [[ "$CIRCLE_NODE_TOTAL" < "2" ]] || [[ "$CIRCLE_NODE_INDEX" == "1" ]]; then make armbuild && make clean; fi - : - parallel: true - - ? | - if [[ "$CIRCLE_NODE_INDEX" == "0" ]] ; then make shortest && make clean; fi && - if [[ "$CIRCLE_NODE_TOTAL" < "2" ]] || [[ "$CIRCLE_NODE_INDEX" == "1" ]]; then make -C tests test-legacy test-longmatch test-symbols && make clean; fi - : - parallel: true - - ? 
| - if [[ "$CIRCLE_NODE_INDEX" == "0" ]] ; then make cxxtest && make clean; fi && - if [[ "$CIRCLE_NODE_TOTAL" < "2" ]] || [[ "$CIRCLE_NODE_INDEX" == "1" ]]; then make -C lib libzstd-nomt && make clean; fi - : - parallel: true - - post: - - echo Circle CI tests finished - - # Longer tests - #- make -C tests test-zstd-nolegacy && make clean - #- pyenv global 3.4.4; make -C tests versionsTest && make clean - #- make zlibwrapper && make clean - #- gcc -v; make -C tests test32 MOREFLAGS="-I/usr/include/x86_64-linux-gnu" && make clean - #- make uasan && make clean - #- make asan32 && make clean - #- make -C tests test32 CC=clang MOREFLAGS="-g -fsanitize=address -I/usr/include/x86_64-linux-gnu" - # Valgrind tests - #- CFLAGS="-O1 -g" make -C zlibWrapper valgrindTest && make clean - #- make -C tests valgrindTest && make clean - # ARM, AArch64, PowerPC, PowerPC64 tests - #- make ppctest && make clean - #- make ppc64test && make clean - #- make armtest && make clean - #- make aarch64test && make clean diff --git a/contrib/adaptive-compression/Makefile b/contrib/adaptive-compression/Makefile index c64fce954126..5a746dcd42da 100644 --- a/contrib/adaptive-compression/Makefile +++ b/contrib/adaptive-compression/Makefile @@ -48,7 +48,7 @@ clean: @echo "finished cleaning" #----------------------------------------------------------------------------- -# make install is validated only for Linux, OSX, BSD, Hurd and Solaris targets +# make install is validated only for Linux, macOS, BSD, Hurd and Solaris targets #----------------------------------------------------------------------------- ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS)) diff --git a/contrib/gen_html/Makefile b/contrib/gen_html/Makefile index 63598b8d718d..425f266c4e46 100644 --- a/contrib/gen_html/Makefile +++ b/contrib/gen_html/Makefile @@ -10,7 +10,7 @@ CXXFLAGS ?= -O3 CXXFLAGS += -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow -Wstrict-aliasing=1 -Wswitch-enum -Wno-comment CXXFLAGS += $(MOREFLAGS) -FLAGS = $(CPPFLAGS) $(CXXFLAGS) $(CXXFLAGS) $(LDFLAGS) +FLAGS = $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) ZSTDAPI = ../../lib/zstd.h ZSTDMANUAL = ../../doc/zstd_manual.html diff --git a/contrib/meson/meson.build b/contrib/meson/meson.build index 079c045a1174..98c9b0293007 100644 --- a/contrib/meson/meson.build +++ b/contrib/meson/meson.build @@ -18,6 +18,7 @@ libzstd_srcs = [ join_paths(common_dir, 'error_private.c'), join_paths(common_dir, 'xxhash.c'), join_paths(compress_dir, 'fse_compress.c'), + join_paths(compress_dir, 'hist.c'), join_paths(compress_dir, 'huf_compress.c'), join_paths(compress_dir, 'zstd_compress.c'), join_paths(compress_dir, 'zstd_fast.c'), @@ -130,6 +131,7 @@ test('fuzzer', fuzzer) if target_machine.system() != 'windows' paramgrill = executable('paramgrill', datagen_c, join_paths(tests_dir, 'paramgrill.c'), + join_paths(programs_dir, 'bench.c'), include_directories: test_includes, link_with: libzstd, dependencies: libm) diff --git a/contrib/pzstd/Makefile b/contrib/pzstd/Makefile index 40531e216539..14b932297714 100644 --- a/contrib/pzstd/Makefile +++ b/contrib/pzstd/Makefile @@ -42,7 +42,7 @@ PZSTD_LDFLAGS = EXTRA_FLAGS = ALL_CFLAGS = $(EXTRA_FLAGS) $(CPPFLAGS) $(PZSTD_CPPFLAGS) $(CFLAGS) $(PZSTD_CFLAGS) ALL_CXXFLAGS = $(EXTRA_FLAGS) $(CPPFLAGS) $(PZSTD_CPPFLAGS) $(CXXFLAGS) $(PZSTD_CXXFLAGS) -ALL_LDFLAGS = $(EXTRA_FLAGS) $(LDFLAGS) $(PZSTD_LDFLAGS) +ALL_LDFLAGS = $(EXTRA_FLAGS) $(CXXFLAGS) $(LDFLAGS) $(PZSTD_LDFLAGS) # gtest libraries need to go before "-lpthread" because 
they depend on it. @@ -50,7 +50,7 @@ GTEST_LIB = -L googletest/build/googlemock/gtest LIBS = # Compilation commands -LD_COMMAND = $(CXX) $^ $(ALL_LDFLAGS) $(LIBS) -lpthread -o $@ +LD_COMMAND = $(CXX) $^ $(ALL_LDFLAGS) $(LIBS) -pthread -o $@ CC_COMMAND = $(CC) $(DEPFLAGS) $(ALL_CFLAGS) -c $< -o $@ CXX_COMMAND = $(CXX) $(DEPFLAGS) $(ALL_CXXFLAGS) -c $< -o $@ diff --git a/contrib/pzstd/Options.cpp b/contrib/pzstd/Options.cpp index d9b216b42951..2123f8894c3e 100644 --- a/contrib/pzstd/Options.cpp +++ b/contrib/pzstd/Options.cpp @@ -18,17 +18,6 @@ #include #include -#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || \ - defined(__CYGWIN__) -#include /* _isatty */ -#define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream)) -#elif defined(_POSIX_C_SOURCE) || defined(_XOPEN_SOURCE) || defined(_POSIX_SOURCE) || (defined(__APPLE__) && defined(__MACH__)) || \ - defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) /* https://sourceforge.net/p/predef/wiki/OperatingSystems/ */ -#include /* isatty */ -#define IS_CONSOLE(stdStream) isatty(fileno(stdStream)) -#else -#define IS_CONSOLE(stdStream) 0 -#endif namespace pzstd { @@ -85,7 +74,7 @@ void usage() { std::fprintf(stderr, "Usage:\n"); std::fprintf(stderr, " pzstd [args] [FILE(s)]\n"); std::fprintf(stderr, "Parallel ZSTD options:\n"); - std::fprintf(stderr, " -p, --processes # : number of threads to use for (de)compression (default:%d)\n", defaultNumThreads()); + std::fprintf(stderr, " -p, --processes # : number of threads to use for (de)compression (default:)\n"); std::fprintf(stderr, "ZSTD options:\n"); std::fprintf(stderr, " -# : # compression level (1-%d, default:%d)\n", kMaxNonUltraCompressionLevel, kDefaultCompressionLevel); diff --git a/contrib/pzstd/Pzstd.cpp b/contrib/pzstd/Pzstd.cpp index 1eb4ce14cf15..6c580b3bccc0 100644 --- a/contrib/pzstd/Pzstd.cpp +++ b/contrib/pzstd/Pzstd.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). 
*/ +#include "platform.h" /* Large Files support, SET_BINARY_MODE */ #include "Pzstd.h" #include "SkippableFrame.h" #include "utils/FileSystem.h" @@ -21,14 +22,6 @@ #include #include -#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__) -# include /* _O_BINARY */ -# include /* _setmode, _isatty */ -# define SET_BINARY_MODE(file) { if (_setmode(_fileno(file), _O_BINARY) == -1) perror("Cannot set _O_BINARY"); } -#else -# include /* isatty */ -# define SET_BINARY_MODE(file) -#endif namespace pzstd { diff --git a/contrib/seekable_format/examples/Makefile b/contrib/seekable_format/examples/Makefile index 1847aa7e7b39..6d9562df8b47 100644 --- a/contrib/seekable_format/examples/Makefile +++ b/contrib/seekable_format/examples/Makefile @@ -9,13 +9,16 @@ # This Makefile presumes libzstd is built, using `make` in / or /lib/ -LDFLAGS += ../../../lib/libzstd.a +ZSTDLIB_PATH = ../../../lib +ZSTDLIB_NAME = libzstd.a +ZSTDLIB = $(ZSTDLIB_PATH)/$(ZSTDLIB_NAME) + CPPFLAGS += -I../ -I../../../lib -I../../../lib/common CFLAGS ?= -O3 CFLAGS += -g -SEEKABLE_OBJS = ../zstdseek_compress.c ../zstdseek_decompress.c +SEEKABLE_OBJS = ../zstdseek_compress.c ../zstdseek_decompress.c $(ZSTDLIB) .PHONY: default all clean test @@ -23,6 +26,9 @@ default: all all: seekable_compression seekable_decompression parallel_processing +$(ZSTDLIB): + make -C $(ZSTDLIB_PATH) $(ZSTDLIB_NAME) + seekable_compression : seekable_compression.c $(SEEKABLE_OBJS) $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ diff --git a/contrib/seekable_format/examples/seekable_compression.c b/contrib/seekable_format/examples/seekable_compression.c index 9485bf26fc46..9a331a89531e 100644 --- a/contrib/seekable_format/examples/seekable_compression.c +++ b/contrib/seekable_format/examples/seekable_compression.c @@ -101,7 +101,7 @@ static void compressFile_orDie(const char* fname, const char* outName, int cLeve free(buffOut); } -static const char* createOutFilename_orDie(const char* filename) +static char* createOutFilename_orDie(const char* filename) { size_t const inL = strlen(filename); size_t const outL = inL + 5; @@ -109,7 +109,7 @@ static const char* createOutFilename_orDie(const char* filename) memset(outSpace, 0, outL); strcat(outSpace, filename); strcat(outSpace, ".zst"); - return (const char*)outSpace; + return (char*)outSpace; } int main(int argc, const char** argv) { @@ -124,8 +124,9 @@ int main(int argc, const char** argv) { { const char* const inFileName = argv[1]; unsigned const frameSize = (unsigned)atoi(argv[2]); - const char* const outFileName = createOutFilename_orDie(inFileName); + char* const outFileName = createOutFilename_orDie(inFileName); compressFile_orDie(inFileName, outFileName, 5, frameSize); + free(outFileName); } return 0; diff --git a/contrib/seekable_format/examples/seekable_decompression.c b/contrib/seekable_format/examples/seekable_decompression.c index 9cd232922636..7050e0fa5c64 100644 --- a/contrib/seekable_format/examples/seekable_decompression.c +++ b/contrib/seekable_format/examples/seekable_decompression.c @@ -84,7 +84,7 @@ static void fseek_orDie(FILE* file, long int offset, int origin) { } -static void decompressFile_orDie(const char* fname, unsigned startOffset, unsigned endOffset) +static void decompressFile_orDie(const char* fname, off_t startOffset, off_t endOffset) { FILE* const fin = fopen_orDie(fname, "rb"); FILE* const fout = stdout; @@ -129,8 +129,8 @@ int main(int argc, const char** argv) { const char* const inFilename = argv[1]; - unsigned const startOffset = 
(unsigned) atoi(argv[2]); - unsigned const endOffset = (unsigned) atoi(argv[3]); + off_t const startOffset = atoll(argv[2]); + off_t const endOffset = atoll(argv[3]); decompressFile_orDie(inFilename, startOffset, endOffset); } diff --git a/contrib/seekable_format/zstd_seekable.h b/contrib/seekable_format/zstd_seekable.h index 438ac20149f8..7ffd1ba0a72b 100644 --- a/contrib/seekable_format/zstd_seekable.h +++ b/contrib/seekable_format/zstd_seekable.h @@ -6,8 +6,10 @@ extern "C" { #endif #include +#include "zstd.h" /* ZSTDLIB_API */ -static const unsigned ZSTD_seekTableFooterSize = 9; + +#define ZSTD_seekTableFooterSize 9 #define ZSTD_SEEKABLE_MAGICNUMBER 0x8F92EAB1 diff --git a/contrib/seekable_format/zstdseek_decompress.c b/contrib/seekable_format/zstdseek_decompress.c index c67653338ace..b4c48754ef87 100644 --- a/contrib/seekable_format/zstdseek_decompress.c +++ b/contrib/seekable_format/zstdseek_decompress.c @@ -24,7 +24,7 @@ #endif /* ************************************************************ -* Avoid fseek()'s 2GiB barrier with MSVC, MacOS, *BSD, MinGW +* Avoid fseek()'s 2GiB barrier with MSVC, macOS, *BSD, MinGW ***************************************************************/ #if defined(_MSC_VER) && _MSC_VER >= 1400 # define LONG_SEEK _fseeki64 @@ -56,6 +56,7 @@ #include /* malloc, free */ #include /* FILE* */ +#include #define XXH_STATIC_LINKING_ONLY #define XXH_NAMESPACE ZSTD_ @@ -88,7 +89,7 @@ static int ZSTD_seekable_read_FILE(void* opaque, void* buffer, size_t n) return 0; } -static int ZSTD_seekable_seek_FILE(void* opaque, S64 offset, int origin) +static int ZSTD_seekable_seek_FILE(void* opaque, long long offset, int origin) { int const ret = LONG_SEEK((FILE*)opaque, offset, origin); if (ret) return ret; @@ -110,9 +111,9 @@ static int ZSTD_seekable_read_buff(void* opaque, void* buffer, size_t n) return 0; } -static int ZSTD_seekable_seek_buff(void* opaque, S64 offset, int origin) +static int ZSTD_seekable_seek_buff(void* opaque, long long offset, int origin) { - buffWrapper_t* buff = (buffWrapper_t*) opaque; + buffWrapper_t* const buff = (buffWrapper_t*) opaque; unsigned long long newOffset; switch (origin) { case SEEK_SET: @@ -124,6 +125,8 @@ static int ZSTD_seekable_seek_buff(void* opaque, S64 offset, int origin) case SEEK_END: newOffset = (unsigned long long)buff->size - offset; break; + default: + assert(0); /* not possible */ } if (newOffset > buff->size) { return -1; @@ -197,7 +200,7 @@ size_t ZSTD_seekable_free(ZSTD_seekable* zs) * Performs a binary search to find the last frame with a decompressed offset * <= pos * @return : the frame's index */ -U32 ZSTD_seekable_offsetToFrameIndex(ZSTD_seekable* const zs, U64 pos) +U32 ZSTD_seekable_offsetToFrameIndex(ZSTD_seekable* const zs, unsigned long long pos) { U32 lo = 0; U32 hi = zs->seekTable.tableLen; @@ -222,13 +225,13 @@ U32 ZSTD_seekable_getNumFrames(ZSTD_seekable* const zs) return zs->seekTable.tableLen; } -U64 ZSTD_seekable_getFrameCompressedOffset(ZSTD_seekable* const zs, U32 frameIndex) +unsigned long long ZSTD_seekable_getFrameCompressedOffset(ZSTD_seekable* const zs, U32 frameIndex) { if (frameIndex >= zs->seekTable.tableLen) return ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE; return zs->seekTable.entries[frameIndex].cOffset; } -U64 ZSTD_seekable_getFrameDecompressedOffset(ZSTD_seekable* const zs, U32 frameIndex) +unsigned long long ZSTD_seekable_getFrameDecompressedOffset(ZSTD_seekable* const zs, U32 frameIndex) { if (frameIndex >= zs->seekTable.tableLen) return ZSTD_SEEKABLE_FRAMEINDEX_TOOLARGE; return 
zs->seekTable.entries[frameIndex].dOffset; @@ -294,7 +297,6 @@ static size_t ZSTD_seekable_loadSeekTable(ZSTD_seekable* zs) { /* Allocate an extra entry at the end so that we can do size * computations on the last element without special case */ seekEntry_t* entries = (seekEntry_t*)malloc(sizeof(seekEntry_t) * (numFrames + 1)); - const BYTE* tableBase = zs->inBuff + ZSTD_skippableHeaderSize; U32 idx = 0; U32 pos = 8; @@ -311,8 +313,8 @@ static size_t ZSTD_seekable_loadSeekTable(ZSTD_seekable* zs) /* compute cumulative positions */ for (; idx < numFrames; idx++) { if (pos + sizePerEntry > SEEKABLE_BUFF_SIZE) { - U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE); U32 const offset = SEEKABLE_BUFF_SIZE - pos; + U32 const toRead = MIN(remaining, SEEKABLE_BUFF_SIZE - offset); memmove(zs->inBuff, zs->inBuff + pos, offset); /* move any data we haven't read yet */ CHECK_IO(src.read(src.opaque, zs->inBuff+offset, toRead)); remaining -= toRead; @@ -372,7 +374,7 @@ size_t ZSTD_seekable_initAdvanced(ZSTD_seekable* zs, ZSTD_seekable_customFile sr return 0; } -size_t ZSTD_seekable_decompress(ZSTD_seekable* zs, void* dst, size_t len, U64 offset) +size_t ZSTD_seekable_decompress(ZSTD_seekable* zs, void* dst, size_t len, unsigned long long offset) { U32 targetFrame = ZSTD_seekable_offsetToFrameIndex(zs, offset); do { diff --git a/doc/images/cdict_v136.png b/doc/images/cdict_v136.png new file mode 100644 index 000000000000..4a6d45620a06 Binary files /dev/null and b/doc/images/cdict_v136.png differ diff --git a/doc/images/ldmCspeed.png b/doc/images/ldmCspeed.png deleted file mode 100644 index d3bfce4c8012..000000000000 Binary files a/doc/images/ldmCspeed.png and /dev/null differ diff --git a/doc/images/ldmDspeed.png b/doc/images/ldmDspeed.png deleted file mode 100644 index d5445f018ade..000000000000 Binary files a/doc/images/ldmDspeed.png and /dev/null differ diff --git a/doc/images/linux-4.7-12-compress.png b/doc/images/linux-4.7-12-compress.png deleted file mode 100644 index 71f3381bdef8..000000000000 Binary files a/doc/images/linux-4.7-12-compress.png and /dev/null differ diff --git a/doc/images/linux-4.7-12-decompress.png b/doc/images/linux-4.7-12-decompress.png deleted file mode 100644 index c99ff9809d54..000000000000 Binary files a/doc/images/linux-4.7-12-decompress.png and /dev/null differ diff --git a/doc/images/linux-4.7-12-mt-compress.png b/doc/images/linux-4.7-12-mt-compress.png deleted file mode 100644 index c66ac0d5e6b9..000000000000 Binary files a/doc/images/linux-4.7-12-mt-compress.png and /dev/null differ diff --git a/doc/images/linux-git-compress.png b/doc/images/linux-git-compress.png deleted file mode 100644 index 3fa0e88f4f16..000000000000 Binary files a/doc/images/linux-git-compress.png and /dev/null differ diff --git a/doc/images/linux-git-decompress.png b/doc/images/linux-git-decompress.png deleted file mode 100644 index 279ba7cf289f..000000000000 Binary files a/doc/images/linux-git-decompress.png and /dev/null differ diff --git a/doc/images/linux-git-mt-compress.png b/doc/images/linux-git-mt-compress.png deleted file mode 100644 index 538b8a9fc626..000000000000 Binary files a/doc/images/linux-git-mt-compress.png and /dev/null differ diff --git a/doc/images/zstd_cdict_v1_3_5.png b/doc/images/zstd_cdict_v1_3_5.png new file mode 100644 index 000000000000..cce67c83abbf Binary files /dev/null and b/doc/images/zstd_cdict_v1_3_5.png differ diff --git a/doc/zstd_compression_format.md b/doc/zstd_compression_format.md index 7bf36c491dc6..e562e628bc9c 100644 --- 
a/doc/zstd_compression_format.md +++ b/doc/zstd_compression_format.md @@ -16,7 +16,7 @@ Distribution of this document is unlimited. ### Version -0.2.6 (19/08/17) +0.3.0 (25/09/18) Introduction @@ -27,6 +27,8 @@ that is independent of CPU type, operating system, file system and character set, suitable for file compression, pipe and streaming compression, using the [Zstandard algorithm](http://www.zstandard.org). +The text of the specification assumes a basic background in programming +at the level of bits and other primitive data representations. The data can be produced or consumed, even for an arbitrarily long sequentially presented input data stream, @@ -39,11 +41,6 @@ for detection of data corruption. The data format defined by this specification does not attempt to allow random access to compressed data. -This specification is intended for use by implementers of software -to compress data into Zstandard format and/or decompress data from Zstandard format. -The text of the specification assumes a basic background in programming -at the level of bits and other primitive data representations. - Unless otherwise indicated below, a compliant compressor must produce data sets that conform to the specifications presented here. @@ -57,6 +54,12 @@ Whenever it does not support a parameter defined in the compressed stream, it must produce a non-ambiguous error code and associated error message explaining which parameter is unsupported. +This specification is intended for use by implementers of software +to compress data into Zstandard format and/or decompress data from Zstandard format. +The Zstandard format is supported by an open source reference implementation, +written in portable C, and available at : https://github.com/facebook/zstd . + + ### Overall conventions In this document: - square brackets i.e. `[` and `]` are used to indicate optional fields or parameters. @@ -69,7 +72,7 @@ A frame is completely independent, has a defined beginning and end, and a set of parameters which tells the decoder how to decompress it. A frame encapsulates one or multiple __blocks__. -Each block can be compressed or not, +Each block contains arbitrary content, which is described by its header, and has a guaranteed maximum content size, which depends on frame parameters. Unlike frames, each block depends on previous blocks for proper decoding. However, each block can be decompressed without waiting for its successor, @@ -92,14 +95,14 @@ Overview Frames ------ Zstandard compressed data is made of one or more __frames__. -Each frame is independent and can be decompressed indepedently of other frames. +Each frame is independent and can be decompressed independently of other frames. The decompressed content of multiple concatenated frames is the concatenation of each frame decompressed content. There are two frame formats defined by Zstandard: Zstandard frames and Skippable frames. Zstandard frames contain compressed data, while -skippable frames contain no data and can be used for metadata. +skippable frames contain custom user metadata. ## Zstandard frames The structure of a single Zstandard frame is following: @@ -112,6 +115,11 @@ __`Magic_Number`__ 4 Bytes, __little-endian__ format. Value : 0xFD2FB528 +Note: This value was selected to be less probable to find at the beginning of some random file. +It avoids trivial patterns (0x00, 0xFF, repeated bytes, increasing bytes, etc.), +contains byte values outside of ASCII range, +and doesn't map into UTF8 space. 
+It reduces the chances that a text file represent this value by accident. __`Frame_Header`__ @@ -171,8 +179,8 @@ according to the following table: |`FCS_Field_Size`| 0 or 1 | 2 | 4 | 8 | When `Flag_Value` is `0`, `FCS_Field_Size` depends on `Single_Segment_flag` : -if `Single_Segment_flag` is set, `Field_Size` is 1. -Otherwise, `Field_Size` is 0 : `Frame_Content_Size` is not provided. +if `Single_Segment_flag` is set, `FCS_Field_Size` is 1. +Otherwise, `FCS_Field_Size` is 0 : `Frame_Content_Size` is not provided. __`Single_Segment_flag`__ @@ -196,10 +204,10 @@ depending on local limitations. __`Unused_bit`__ -The value of this bit should be set to zero. -A decoder compliant with this specification version shall not interpret it. -It might be used in a future version, -to signal a property which is not mandatory to properly decode the frame. +A decoder compliant with this specification version shall not interpret this bit. +It might be used in any future version, +to signal a property which is transparent to properly decode the frame. +An encoder compliant with this specification version must set this bit to zero. __`Reserved_bit`__ @@ -218,11 +226,11 @@ __`Dictionary_ID_flag`__ This is a 2-bits flag (`= FHD & 3`), telling if a dictionary ID is provided within the header. -It also specifies the size of this field as `Field_Size`. +It also specifies the size of this field as `DID_Field_Size`. -|`Flag_Value`| 0 | 1 | 2 | 3 | -| ---------- | --- | --- | --- | --- | -|`Field_Size`| 0 | 1 | 2 | 4 | +|`Flag_Value` | 0 | 1 | 2 | 3 | +| -------------- | --- | --- | --- | --- | +|`DID_Field_Size`| 0 | 1 | 2 | 4 | #### `Window_Descriptor` @@ -249,6 +257,9 @@ Window_Size = windowBase + windowAdd; The minimum `Window_Size` is 1 KB. The maximum `Window_Size` is `(1<<41) + 7*(1<<38)` bytes, which is 3.75 TB. +In general, larger `Window_Size` tend to improve compression ratio, +but at the cost of memory usage. + To properly decode compressed data, a decoder will need to allocate a buffer of at least `Window_Size` bytes. @@ -257,8 +268,8 @@ a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range. For improved interoperability, -decoders are recommended to be compatible with `Window_Size <= 8 MB`, -and encoders are recommended to not request more than 8 MB. +it's recommended for decoders to support `Window_Size` of up to 8 MB, +and it's recommended for encoders to not generate frame requiring `Window_Size` larger than 8 MB. It's merely a recommendation though, decoders are free to support larger or lower limits, depending on local limitations. @@ -268,9 +279,10 @@ depending on local limitations. This is a variable size field, which contains the ID of the dictionary required to properly decode the frame. `Dictionary_ID` field is optional. When it's not present, -it's up to the decoder to make sure it uses the correct dictionary. +it's up to the decoder to know which dictionary to use. -Field size depends on `Dictionary_ID_flag`. +`Dictionary_ID` field size is provided by `DID_Field_Size`. +`DID_Field_Size` is directly derived from value of `Dictionary_ID_flag`. 1 byte can represent an ID 0-255. 2 bytes can represent an ID 0-65535. 4 bytes can represent an ID 0-4294967295. @@ -280,13 +292,21 @@ It's allowed to represent a small ID (for example `13`) with a large 4-bytes dictionary ID, even if it is less efficient. _Reserved ranges :_ -If the frame is going to be distributed in a private environment, -any dictionary ID can be used. 
-However, for public distribution of compressed frames using a dictionary, -the following ranges are reserved and shall not be used : +Within private environments, any `Dictionary_ID` can be used. + +However, for frames and dictionaries distributed in public space, +`Dictionary_ID` must be attributed carefully. +Rules for public environment are not yet decided, +but the following ranges are reserved for some future registrar : - low range : `<= 32767` - high range : `>= (1 << 31)` +Outside of these ranges, any value of `Dictionary_ID` +which is both `>= 32768` and `< (1<<31)` can be used freely, +even in public environment. + + + #### `Frame_Content_Size` This is the original (uncompressed) size. This information is optional. @@ -359,20 +379,21 @@ There are 4 block types : - `Reserved` - this is not a block. This value cannot be used with current version of this specification. + If such a value is present, it is considered corrupted data. __`Block_Size`__ The upper 21 bits of `Block_Header` represent the `Block_Size`. +`Block_Size` is the size of the block excluding the header. +A block can contain any number of bytes (even zero), up to +`Block_Maximum_Decompressed_Size`, which is the smallest of: +- Window_Size +- 128 KB -Block sizes must respect a few rules : -- For `Compressed_Block`, `Block_Size` is always strictly less than decompressed size. -- Block decompressed size is always <= `Window_Size` -- Block decompressed size is always <= 128 KB. - -A block can contain any number of bytes (even empty), -up to `Block_Maximum_Decompressed_Size`, which is the smallest of : -- `Window_Size` -- 128 KB +A `Compressed_Block` has the extra restriction that `Block_Size` is always +strictly less than the decompressed size. +If this condition cannot be respected, +the block must be sent uncompressed instead (`Raw_Block`). Compressed Blocks @@ -390,10 +411,16 @@ data in [Sequence Execution](#sequence-execution) #### Prerequisites To decode a compressed block, the following elements are necessary : - Previous decoded data, up to a distance of `Window_Size`, - or all previously decoded data when `Single_Segment_flag` is set. + or beginning of the Frame, whichever is smaller. - List of "recent offsets" from previous `Compressed_Block`. -- Decoding tables of previous `Compressed_Block` for each symbol type - (literals, literals lengths, match lengths, offsets). +- The previous Huffman tree, required by `Treeless_Literals_Block` type +- Previous FSE decoding tables, required by `Repeat_Mode` + for each symbol type (literals lengths, match lengths, offsets) + +Note that decoding tables aren't always from the previous `Compressed_Block`. + +- Every decoding table can come from a dictionary. +- The Huffman tree comes from the previous `Compressed_Literals_Block`. Literals Section ---------------- @@ -405,11 +432,11 @@ Literals can be stored uncompressed or compressed using Huffman prefix codes. When compressed, an optional tree description can be present, followed by 1 or 4 streams. 
-| `Literals_Section_Header` | [`Huffman_Tree_Description`] | Stream1 | [Stream2] | [Stream3] | [Stream4] | -| ------------------------- | ---------------------------- | ------- | --------- | --------- | --------- | +| `Literals_Section_Header` | [`Huffman_Tree_Description`] | [jumpTable] | Stream1 | [Stream2] | [Stream3] | [Stream4] | +| ------------------------- | ---------------------------- | ----------- | ------- | --------- | --------- | --------- | -#### `Literals_Section_Header` +### `Literals_Section_Header` Header is in charge of describing how literals are packed. It's a byte-aligned variable-size bitfield, ranging from 1 to 5 bytes, @@ -460,18 +487,21 @@ For values spanning several bytes, convention is __little-endian__. __`Size_Format` for `Raw_Literals_Block` and `RLE_Literals_Block`__ : -- Value ?0 : `Size_Format` uses 1 bit. +`Size_Format` uses 1 _or_ 2 bits. +Its value is : `Size_Format = (Literals_Section_Header[0]>>2) & 3` + +- `Size_Format` == 00 or 10 : `Size_Format` uses 1 bit. `Regenerated_Size` uses 5 bits (0-31). - `Literals_Section_Header` has 1 byte. - `Regenerated_Size = Header[0]>>3` -- Value 01 : `Size_Format` uses 2 bits. + `Literals_Section_Header` uses 1 byte. + `Regenerated_Size = Literals_Section_Header[0]>>3` +- `Size_Format` == 01 : `Size_Format` uses 2 bits. `Regenerated_Size` uses 12 bits (0-4095). - `Literals_Section_Header` has 2 bytes. - `Regenerated_Size = (Header[0]>>4) + (Header[1]<<4)` -- Value 11 : `Size_Format` uses 2 bits. + `Literals_Section_Header` uses 2 bytes. + `Regenerated_Size = (Literals_Section_Header[0]>>4) + (Literals_Section_Header[1]<<4)` +- `Size_Format` == 11 : `Size_Format` uses 2 bits. `Regenerated_Size` uses 20 bits (0-1048575). - `Literals_Section_Header` has 3 bytes. - `Regenerated_Size = (Header[0]>>4) + (Header[1]<<4) + (Header[2]<<12)` + `Literals_Section_Header` uses 3 bytes. + `Regenerated_Size = (Literals_Section_Header[0]>>4) + (Literals_Section_Header[1]<<4) + (Literals_Section_Header[2]<<12)` Only Stream1 is present for these cases. Note : it's allowed to represent a short value (for example `13`) @@ -479,66 +509,74 @@ using a long format, even if it's less efficient. __`Size_Format` for `Compressed_Literals_Block` and `Treeless_Literals_Block`__ : -- Value 00 : _A single stream_. +`Size_Format` always uses 2 bits. + +- `Size_Format` == 00 : _A single stream_. Both `Regenerated_Size` and `Compressed_Size` use 10 bits (0-1023). - `Literals_Section_Header` has 3 bytes. -- Value 01 : 4 streams. + `Literals_Section_Header` uses 3 bytes. +- `Size_Format` == 01 : 4 streams. Both `Regenerated_Size` and `Compressed_Size` use 10 bits (0-1023). - `Literals_Section_Header` has 3 bytes. -- Value 10 : 4 streams. + `Literals_Section_Header` uses 3 bytes. +- `Size_Format` == 10 : 4 streams. Both `Regenerated_Size` and `Compressed_Size` use 14 bits (0-16383). - `Literals_Section_Header` has 4 bytes. -- Value 11 : 4 streams. + `Literals_Section_Header` uses 4 bytes. +- `Size_Format` == 11 : 4 streams. Both `Regenerated_Size` and `Compressed_Size` use 18 bits (0-262143). - `Literals_Section_Header` has 5 bytes. + `Literals_Section_Header` uses 5 bytes. Both `Compressed_Size` and `Regenerated_Size` fields follow __little-endian__ convention. Note: `Compressed_Size` __includes__ the size of the Huffman Tree description _when_ it is present. -### Raw Literals Block +#### Raw Literals Block The data in Stream1 is `Regenerated_Size` bytes long, it contains the raw literals data to be used during [Sequence Execution]. 
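The header layouts above reduce to a small amount of bit arithmetic. The following sketch (hypothetical names, not part of the reference `libzstd` API) parses a `Literals_Section_Header`, treating the multi-byte variants as one little-endian integer whose two size fields sit after the 4 low header bits, as described above:

```
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Illustrative parser for Literals_Section_Header (hypothetical names,
 * not part of the reference libzstd API). */
typedef struct {
    unsigned blockType;       /* 0=Raw, 1=RLE, 2=Compressed, 3=Treeless */
    unsigned numStreams;      /* 1 or 4 */
    size_t   headerSize;      /* 1 to 5 bytes */
    uint32_t regeneratedSize;
    uint32_t compressedSize;  /* only meaningful for types 2 and 3 */
} LitHeader;

static int parseLiteralsHeader(const uint8_t* in, size_t inSize, LitHeader* h)
{
    unsigned sizeFormat;
    if (inSize < 1) return -1;
    h->blockType = in[0] & 3;
    sizeFormat = (in[0] >> 2) & 3;

    if (h->blockType <= 1) {       /* Raw_Literals_Block or RLE_Literals_Block */
        h->numStreams = 1;
        h->compressedSize = 0;
        switch (sizeFormat) {
        case 0: case 2:            /* 1-byte header, 5-bit Regenerated_Size */
            h->headerSize = 1;
            h->regeneratedSize = in[0] >> 3;
            break;
        case 1:                    /* 2-byte header, 12-bit Regenerated_Size */
            if (inSize < 2) return -1;
            h->headerSize = 2;
            h->regeneratedSize = (in[0] >> 4) + ((uint32_t)in[1] << 4);
            break;
        default:                   /* 3-byte header, 20-bit Regenerated_Size */
            if (inSize < 3) return -1;
            h->headerSize = 3;
            h->regeneratedSize = (in[0] >> 4) + ((uint32_t)in[1] << 4)
                               + ((uint32_t)in[2] << 12);
        }
    } else {                       /* Compressed or Treeless literals block */
        /* Both size fields are little-endian and sit after the 4 low bits. */
        static const struct { uint8_t bytes; uint8_t bits; } fmt[4] =
            { {3,10}, {3,10}, {4,14}, {5,18} };
        size_t const hSize = fmt[sizeFormat].bytes;
        unsigned const nBits = fmt[sizeFormat].bits;
        uint64_t v = 0;
        size_t i;
        if (inSize < hSize) return -1;
        for (i = 0; i < hSize; i++) v |= (uint64_t)in[i] << (8*i);
        h->headerSize = hSize;
        h->numStreams = (sizeFormat == 0) ? 1 : 4;
        h->regeneratedSize = (uint32_t)((v >> 4) & ((1u << nBits) - 1));
        h->compressedSize  = (uint32_t)((v >> (4 + nBits)) & ((1u << nBits) - 1));
    }
    return 0;
}

int main(void)
{
    uint8_t const raw[1] = { 0x28 };  /* Raw block, 1-byte header, Regenerated_Size = 5 */
    LitHeader h;
    if (parseLiteralsHeader(raw, sizeof raw, &h) == 0)
        printf("type=%u streams=%u headerSize=%u regenerated=%u\n",
               h.blockType, h.numStreams, (unsigned)h.headerSize, h.regeneratedSize);
    return 0;
}
```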
-### RLE Literals Block +#### RLE Literals Block Stream1 consists of a single byte which should be repeated `Regenerated_Size` times to generate the decoded literals. -### Compressed Literals Block and Treeless Literals Block +#### Compressed Literals Block and Treeless Literals Block Both of these modes contain Huffman encoded data. -`Treeless_Literals_Block` does not have a `Huffman_Tree_Description`. -#### `Huffman_Tree_Description` +For `Treeless_Literals_Block`, +the Huffman table comes from previously compressed literals block, +or from a dictionary. + + +### `Huffman_Tree_Description` This section is only present when `Literals_Block_Type` type is `Compressed_Literals_Block` (`2`). The format of the Huffman tree description can be found at [Huffman Tree description](#huffman-tree-description). The size of `Huffman_Tree_Description` is determined during decoding process, it must be used to determine where streams begin. `Total_Streams_Size = Compressed_Size - Huffman_Tree_Description_Size`. -For `Treeless_Literals_Block`, -the Huffman table comes from previously compressed literals block. -Huffman compressed data consists of either 1 or 4 Huffman-coded streams. +### Jump Table +The Jump Table is only present when there are 4 Huffman-coded streams. + +Reminder : Huffman compressed data consists of either 1 or 4 Huffman-coded streams. If only one stream is present, it is a single bitstream occupying the entire remaining portion of the literals block, encoded as described within [Huffman-Coded Streams](#huffman-coded-streams). -If there are four streams, the literals section header only provides enough -information to know the decompressed and compressed sizes of all four streams _combined_. -The decompressed size of each stream is equal to `(Regenerated_Size+3)/4`, +If there are four streams, `Literals_Section_Header` only provided +enough information to know the decompressed and compressed sizes +of all four streams _combined_. +The decompressed size of _each_ stream is equal to `(Regenerated_Size+3)/4`, except for the last stream which may be up to 3 bytes smaller, to reach a total decompressed size as specified in `Regenerated_Size`. -The compressed size of each stream is provided explicitly: -the first 6 bytes of the compressed data consist of three 2-byte __little-endian__ fields, +The compressed size of each stream is provided explicitly in the Jump Table. +Jump Table is 6 bytes long, and consist of three 2-byte __little-endian__ fields, describing the compressed sizes of the first three streams. `Stream4_Size` is computed from total `Total_Streams_Size` minus sizes of other streams. `Stream4_Size = Total_Streams_Size - 6 - Stream1_Size - Stream2_Size - Stream3_Size`. -Note: remember that `Total_Streams_Size` can be smaller than `Compressed_Size` in header, -because `Compressed_Size` also contains `Huffman_Tree_Description_Size` when it is present. +Note: if `Stream1_Size + Stream2_Size + Stream3_Size > Total_Streams_Size`, +data is considered corrupted. Each of these 4 bitstreams is then decoded independently as a Huffman-Coded stream, as described at [Huffman-Coded Streams](#huffman-coded-streams) @@ -553,10 +591,10 @@ It is the number of bytes to be copied (or extracted) from the Literals Section. A match copy command specifies an offset and a length. When all _sequences_ are decoded, -if there are literals left in the _literal section_, +if there are literals left in the _literals section_, these bytes are added at the end of the block. 
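Assuming the Jump Table layout just described, splitting the four Huffman-coded literal streams can be sketched as follows (illustrative helper, not the reference decoder; `totalStreamsSize` is `Compressed_Size` minus the Huffman tree description size):

```
#include <stdint.h>
#include <stddef.h>

/* Splits the four Huffman-coded literal streams using the 6-byte Jump Table
 * described above (illustrative helper, not the reference decoder). */
typedef struct { size_t csize[4]; size_t dsize[4]; } StreamLayout;

static int splitFourStreams(const uint8_t* src, size_t totalStreamsSize,
                            size_t regeneratedSize, StreamLayout* out)
{
    size_t s1, s2, s3, per;
    if (totalStreamsSize < 6) return -1;          /* no room for the Jump Table */

    /* Three 2-byte little-endian fields: compressed sizes of streams 1-3. */
    s1 = src[0] | ((size_t)src[1] << 8);
    s2 = src[2] | ((size_t)src[3] << 8);
    s3 = src[4] | ((size_t)src[5] << 8);
    if (s1 + s2 + s3 > totalStreamsSize - 6) return -1;   /* corrupted data */

    out->csize[0] = s1;
    out->csize[1] = s2;
    out->csize[2] = s3;
    out->csize[3] = totalStreamsSize - 6 - s1 - s2 - s3;  /* Stream4_Size */

    /* Each stream regenerates (Regenerated_Size+3)/4 bytes, except the last,
     * which holds whatever remains (up to 3 bytes fewer). */
    per = (regeneratedSize + 3) / 4;
    if (3 * per > regeneratedSize) return -1;             /* last stream would underflow */
    out->dsize[0] = out->dsize[1] = out->dsize[2] = per;
    out->dsize[3] = regeneratedSize - 3 * per;
    return 0;
}
```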
-This is described in more detail in [Sequence Execution](#sequence-execution) +This is described in more detail in [Sequence Execution](#sequence-execution). The `Sequences_Section` regroup all symbols required to decode commands. There are 3 symbol types : literals lengths, offsets and match lengths. @@ -570,7 +608,8 @@ followed by the bitstream. | -------------------------- | ------------------------- | ---------------- | ---------------------- | --------- | To decode the `Sequences_Section`, it's required to know its size. -This size is deduced from `Block_Size - Literals_Section_Size`. +Its size is deduced from the size of `Literals_Section`: +`Sequences_Section_Size = Block_Size - Literals_Section_Size`. #### `Sequences_Section_Header` @@ -586,6 +625,7 @@ Let's call its first byte `byte0`. - `if (byte0 == 0)` : there are no sequences. The sequence section stops there. Decompressed content is defined entirely as Literals Section content. + The FSE tables used in `Repeat_Mode` aren't updated. - `if (byte0 < 128)` : `Number_of_Sequences = byte0` . Uses 1 byte. - `if (byte0 < 255)` : `Number_of_Sequences = ((byte0-128) << 8) + byte1` . Uses 2 bytes. - `if (byte0 == 255)`: `Number_of_Sequences = byte1 + (byte2<<8) + 0x7F00` . Uses 3 bytes. @@ -612,18 +652,22 @@ They follow the same enumeration : - `Predefined_Mode` : A predefined FSE distribution table is used, defined in [default distributions](#default-distributions). No distribution table will be present. -- `RLE_Mode` : The table description consists of a single byte. - This code will be repeated for all sequences. -- `Repeat_Mode` : The table used in the previous compressed block will be used again. - No distribution table will be present. - Note: this includes RLE mode, so if `Repeat_Mode` follows `RLE_Mode`, the same symbol will be repeated. - If this mode is used without any previous sequence table in the frame - (or [dictionary](#dictionary-format)) to repeat, this should be treated as corruption. +- `RLE_Mode` : The table description consists of a single byte, which contains the symbol's value. + This symbol will be used for all sequences. - `FSE_Compressed_Mode` : standard FSE compression. A distribution table will be present. The format of this distribution table is described in [FSE Table Description](#fse-table-description). Note that the maximum allowed accuracy log for literals length and match length tables is 9, and the maximum accuracy log for the offsets table is 8. + `FSE_Compressed_Mode` must not be used when only one symbol is present, + `RLE_Mode` should be used instead (although any other mode will work). +- `Repeat_Mode` : The table used in the previous `Compressed_Block` with `Number_of_Sequences > 0` will be used again, + or if this is the first block, table in the dictionary will be used. + Note that this includes `RLE_mode`, so if `Repeat_Mode` follows `RLE_Mode`, the same symbol will be repeated. + It also includes `Predefined_Mode`, in which case `Repeat_Mode` will have same outcome as `Predefined_Mode`. + No distribution table will be present. + If this mode is used without any previous sequence table in the frame + (nor [dictionary](#dictionary-format)) to repeat, this should be treated as corruption. #### The codes for literals lengths, match lengths, and offsets. @@ -696,7 +740,7 @@ Offset codes are values ranging from `0` to `N`. A decoder is free to limit its maximum `N` supported. Recommendation is to support at least up to `22`. For information, at the time of this writing. 
-the reference decoder supports a maximum `N` value of `28` in 64-bits mode. +the reference decoder supports a maximum `N` value of `31`. An offset code is also the number of additional bits to read in __little-endian__ fashion, and can be translated into an `Offset_Value` using the following formulas : @@ -705,7 +749,8 @@ and can be translated into an `Offset_Value` using the following formulas : Offset_Value = (1 << offsetCode) + readNBits(offsetCode); if (Offset_Value > 3) offset = Offset_Value - 3; ``` -It means that maximum `Offset_Value` is `(2^(N+1))-1` and it supports back-reference distance up to `(2^(N+1))-4` +It means that maximum `Offset_Value` is `(2^(N+1))-1` +supporting back-reference distances up to `(2^(N+1))-4`, but is limited by [maximum back-reference distance](#window_descriptor). `Offset_Value` from 1 to 3 are special : they define "repeat codes". @@ -760,7 +805,7 @@ one and ending with the first. ##### Decoding a sequence For each of the symbol types, the FSE state can be used to determine the appropriate code. -The code then defines the baseline and number of bits to read for each type. +The code then defines the `Baseline` and `Number_of_Bits` to read for each type. See the [description of the codes] for how to determine these values. [description of the codes]: #the-codes-for-literals-lengths-match-lengths-and-offsets @@ -827,8 +872,8 @@ they are combined to produce the decoded content of a block. Each sequence consists of a tuple of (`literals_length`, `offset_value`, `match_length`), decoded as described in the [Sequences Section](#sequences-section). -To execute a sequence, first copy `literals_length` bytes from the literals section -to the output. +To execute a sequence, first copy `literals_length` bytes +from the decoded literals to the output. Then `match_length` bytes are copied from previous decoded data. The offset to copy from is determined by `offset_value`: @@ -856,7 +901,9 @@ so an `offset_value` of 1 means `Repeated_Offset2`, an `offset_value` of 2 means `Repeated_Offset3`, and an `offset_value` of 3 means `Repeated_Offset1 - 1_byte`. -For the first block, the starting offset history is populated with the following values : 1, 4 and 8 (in order). +For the first block, the starting offset history is populated with following values : +`Repeated_Offset1`=1, `Repeated_Offset2`=4, `Repeated_Offset3`=8, +unless a dictionary is used, in which case they come from the dictionary. Then each block gets its starting offset history from the ending values of the most recent `Compressed_Block`. Note that blocks which are not `Compressed_Block` are skipped, they do not contribute to offset history. @@ -880,21 +927,28 @@ Skippable Frames |:--------------:|:------------:|:-----------:| | 4 bytes | 4 bytes | n bytes | -Skippable frames allow the insertion of user-defined data +Skippable frames allow the insertion of user-defined metadata into a flow of concatenated frames. -Its design is pretty straightforward, -with the sole objective to allow the decoder to quickly skip -over user-defined data and continue decoding. Skippable frames defined in this specification are compatible with [LZ4] ones. [LZ4]:http://www.lz4.org +From a compliant decoder perspective, skippable frames need just be skipped, +and their content ignored, resuming decoding after the skippable frame. + +It can be noted that a skippable frame +can be used to watermark a stream of concatenated frames +embedding any kind of tracking information (even just an UUID). 
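As an illustration of the offset codes and repeat-offset rules from the Sequences Section above, the sketch below (hypothetical helper, not the reference decoder) resolves a decoded `offset_value` against the three-entry repeat-offset history and updates it; for the first block the history starts as 1, 4, 8:

```
#include <stdint.h>

/* Resolves a sequence's offset_value against the repeat-offset history
 * (Repeated_Offset1..3, held in rep[0..2]) and updates that history.
 * Hedged sketch of the rules described above, not the reference decoder.
 * For the first block the history starts as {1, 4, 8}. */
static uint32_t resolveOffset(uint32_t offsetValue, uint32_t literalsLength,
                              uint32_t rep[3])
{
    uint32_t offset;
    if (offsetValue > 3) {
        offset = offsetValue - 3;                  /* plain offset */
        rep[2] = rep[1]; rep[1] = rep[0]; rep[0] = offset;
        return offset;
    }
    /* offset_value 1..3 are repeat codes; with literals_length == 0
     * the mapping is shifted by one, and index 3 means Repeated_Offset1 - 1. */
    {   uint32_t const idx = offsetValue - 1 + (literalsLength == 0);
        offset = (idx == 3) ? rep[0] - 1 : rep[idx];
        if (idx != 0) {                            /* used offset moves to the front */
            if (idx != 1) rep[2] = rep[1];
            rep[1] = rep[0];
            rep[0] = offset;
        }
    }
    return offset;
}
```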
+Users wary of such possibility should scan the stream of concatenated frames +in an attempt to detect such frame for analysis or removal. + __`Magic_Number`__ 4 Bytes, __little-endian__ format. Value : 0x184D2A5?, which means any value from 0x184D2A50 to 0x184D2A5F. All 16 values are valid to identify a skippable frame. +This specification doesn't detail any specific tagging for skippable frames. __`Frame_Size`__ @@ -908,10 +962,16 @@ __`User_Data`__ The `User_Data` can be anything. Data will just be skipped by the decoder. + Entropy Encoding ---------------- Two types of entropy encoding are used by the Zstandard format: FSE, and Huffman coding. +Huffman is used to compress literals, +while FSE is used for all other symbols +(`Literals_Length_Code`, `Match_Length_Code`, offset codes) +and to compress Huffman headers. + FSE --- @@ -919,6 +979,8 @@ FSE, short for Finite State Entropy, is an entropy codec based on [ANS]. FSE encoding/decoding involves a state that is carried over between symbols, so decoding must be done in the opposite direction as encoding. Therefore, all FSE bitstreams are read from end to beginning. +Note that the order of the bits in the stream is not reversed, +we just read the elements in the reverse order they are written. For additional details on FSE, see [Finite State Entropy]. @@ -927,7 +989,7 @@ For additional details on FSE, see [Finite State Entropy]. FSE decoding involves a decoding table which has a power of 2 size, and contain three elements: `Symbol`, `Num_Bits`, and `Baseline`. The `log2` of the table size is its `Accuracy_Log`. -The FSE state represents an index in this table. +An FSE state value represents an index in this table. To obtain the initial state value, consume `Accuracy_Log` bits from the stream as a __little-endian__ value. The next symbol in the stream is the `Symbol` indicated in the table for that state. @@ -943,12 +1005,14 @@ The Zstandard format encodes FSE table descriptions as follows: An FSE distribution table describes the probabilities of all symbols from `0` to the last present one (included) on a normalized scale of `1 << Accuracy_Log` . +Note that there must be two or more symbols with nonzero probability. It's a bitstream which is read forward, in __little-endian__ fashion. -It's not necessary to know its exact size, -since it will be discovered and reported by the decoding process. +It's not necessary to know bitstream exact size, +it will be discovered and reported by the decoding process. The bitstream starts by reporting on which scale it operates. +Let's `low4Bits` designate the lowest 4 bits of the first byte : `Accuracy_Log = low4bits + 5`. Then follows each symbol value, from `0` to last present one. @@ -959,24 +1023,24 @@ It depends on : __example__ : Presuming an `Accuracy_Log` of 8, and presuming 100 probabilities points have already been distributed, - the decoder may read any value from `0` to `255 - 100 + 1 == 156` (inclusive). - Therefore, it must read `log2sup(156) == 8` bits. + the decoder may read any value from `0` to `256 - 100 + 1 == 157` (inclusive). + Therefore, it must read `log2sup(157) == 8` bits. - Value decoded : small values use 1 less bit : __example__ : - Presuming values from 0 to 156 (inclusive) are possible, - 255-156 = 99 values are remaining in an 8-bits field. + Presuming values from 0 to 157 (inclusive) are possible, + 255-157 = 98 values are remaining in an 8-bits field. They are used this way : - first 99 values (hence from 0 to 98) use only 7 bits, - values from 99 to 156 use 8 bits. 
+ first 98 values (hence from 0 to 97) use only 7 bits, + values from 98 to 157 use 8 bits. This is achieved through this scheme : | Value read | Value decoded | Number of bits used | | ---------- | ------------- | ------------------- | - | 0 - 98 | 0 - 98 | 7 | - | 99 - 127 | 99 - 127 | 8 | - | 128 - 226 | 0 - 98 | 7 | - | 227 - 255 | 128 - 156 | 8 | + | 0 - 97 | 0 - 97 | 7 | + | 98 - 127 | 98 - 127 | 8 | + | 128 - 225 | 0 - 97 | 7 | + | 226 - 255 | 128 - 157 | 8 | Symbols probabilities are read one by one, in order. @@ -1006,7 +1070,7 @@ and how many symbols are present. The bitstream consumes a round number of bytes. Any remaining bit within the last byte is just unused. -##### From normalized distribution to decoding tables +#### From normalized distribution to decoding tables The distribution of normalized probabilities is enough to create a unique decoding table. @@ -1019,12 +1083,12 @@ and instructions to get the next state. Symbols are scanned in their natural order for "less than 1" probabilities. Symbols with this probability are being attributed a single cell, -starting from the end of the table. +starting from the end of the table and retreating. These symbols define a full state reset, reading `Accuracy_Log` bits. -All remaining symbols are sorted in their natural order. +All remaining symbols are allocated in their natural order. Starting from symbol `0` and table position `0`, -each symbol gets attributed as many cells as its probability. +each symbol gets allocated as many cells as its probability. Cell allocation is spreaded, not linear : each successor position follow this rule : @@ -1044,6 +1108,7 @@ Each state will decode the current symbol. To get the `Number_of_Bits` and `Baseline` required for next state, it's first necessary to sort all states in their natural order. The lower states will need 1 more bit than higher ones. +The process is repeated for each symbol. __Example__ : Presuming a symbol has a probability of 5. @@ -1055,10 +1120,12 @@ Presuming the `Accuracy_Log` is 7, it defines 128 states. Divided by 8, each share is 16 large. In order to reach 8, 8-5=3 lowest states will count "double", -taking shares twice larger, +doubling the number of shares (32 in width), requiring one more bit in the process. -Numbering starts from higher states using less bits. +Baseline is assigned starting from the higher states using fewer bits, +and proceeding naturally, then resuming at the first state, +each takes its allocated width from Baseline. | state order | 0 | 1 | 2 | 3 | 4 | | ---------------- | ----- | ----- | ------ | ---- | ----- | @@ -1075,6 +1142,7 @@ See [Appendix A] for the results of this process applied to the default distribu [Appendix A]: #appendix-a---decoding-tables-for-predefined-codes + Huffman Coding -------------- Zstandard Huffman-coded streams are read backwards, @@ -1096,6 +1164,7 @@ The bitstream contains Huffman-coded symbols in __little-endian__ order, with the codes defined by the method below. ### Huffman Tree Description + Prefix coding represents symbols from an a priori known alphabet by bit sequences (codewords), one codeword for each symbol, in a manner such that different symbols may be represented @@ -1112,8 +1181,7 @@ More bits improve accuracy but cost more header size, and require more memory or more complex decoding operations. This specification limits maximum code length to 11 bits. 
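Not part of the specification, but to illustrate the trade-off just mentioned : the 11-bit cap allows a decoder to use a single lookup table of at most `1 << 11` entries. The sketch below shows that common layout; `HufDEltSketch` and `huf_decode_one` are illustrative names only, not the reference implementation.

```
#include <stdint.h>

/* Illustrative decoder layout enabled by the 11-bit limit :
 * entries for a code of length L are replicated 2^(Max_Number_of_Bits - L) times
 * when the table is built, so decoding one symbol is a single lookup. */
typedef struct { uint8_t symbol; uint8_t nbBits; } HufDEltSketch;

static uint8_t huf_decode_one(const HufDEltSketch* table,
                              uint32_t nextBits,        /* next Max_Number_of_Bits bits of the stream */
                              unsigned* nbBitsConsumed)
{
    HufDEltSketch const e = table[nextBits];
    *nbBitsConsumed = e.nbBits;   /* only the actual code length is consumed */
    return e.symbol;
}
```

Such a table occupies at most a few kilobytes, which is the memory cost the paragraph above alludes to.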
- -##### Representation +#### Representation All literal values from zero (included) to last present one (excluded) are represented by `Weight` with values from `0` to `Max_Number_of_Bits`. @@ -1124,16 +1192,19 @@ Number_of_Bits = Weight ? (Max_Number_of_Bits + 1 - Weight) : 0 The last symbol's `Weight` is deduced from previously decoded ones, by completing to the nearest power of 2. This power of 2 gives `Max_Number_of_Bits`, the depth of the current tree. +`Max_Number_of_Bits` must be <= 11, +otherwise the representation is considered corrupted. __Example__ : Let's presume the following Huffman tree must be described : -| literal | 0 | 1 | 2 | 3 | 4 | 5 | +| literal value | 0 | 1 | 2 | 3 | 4 | 5 | | ---------------- | --- | --- | --- | --- | --- | --- | | `Number_of_Bits` | 1 | 2 | 3 | 0 | 4 | 4 | -The tree depth is 4, since its smallest element uses 4 bits. -Value `5` will not be listed as it can be determined from the values for 0-4, +The tree depth is 4, since its longest elements uses 4 bits +(longest elements are the one with smallest frequency). +Value `5` will not be listed, as it can be determined from values for 0-4, nor will values above `5` as they are all 0. Values from `0` to `4` will be listed using `Weight` instead of `Number_of_Bits`. Weight formula is : @@ -1142,41 +1213,49 @@ Weight = Number_of_Bits ? (Max_Number_of_Bits + 1 - Number_of_Bits) : 0 ``` It gives the following series of weights : -| literal | 0 | 1 | 2 | 3 | 4 | -| -------- | --- | --- | --- | --- | --- | -| `Weight` | 4 | 3 | 2 | 0 | 1 | +| literal value | 0 | 1 | 2 | 3 | 4 | +| ------------- | --- | --- | --- | --- | --- | +| `Weight` | 4 | 3 | 2 | 0 | 1 | The decoder will do the inverse operation : -having collected weights of literals from `0` to `4`, -it knows the last literal, `5`, is present with a non-zero weight. -The weight of `5` can be determined by advancing to the next power of 2. +having collected weights of literal symbols from `0` to `4`, +it knows the last literal, `5`, is present with a non-zero `Weight`. +The `Weight` of `5` can be determined by advancing to the next power of 2. The sum of `2^(Weight-1)` (excluding 0's) is : `8 + 4 + 2 + 0 + 1 = 15`. -Nearest power of 2 is 16. -Therefore, `Max_Number_of_Bits = 4` and `Weight[5] = 1`. +Nearest larger power of 2 value is 16. +Therefore, `Max_Number_of_Bits = 4` and `Weight[5] = 16-15 = 1`. -##### Huffman Tree header +#### Huffman Tree header This is a single byte value (0-255), -which describes how to decode the list of weights. - -- if `headerByte` >= 128 : this is a direct representation, - where each `Weight` is written directly as a 4 bits field (0-15). - They are encoded forward, 2 weights to a byte with the first weight taking - the top four bits and the second taking the bottom four (e.g. the following - operations could be used to read the weights: - `Weight[0] = (Byte[0] >> 4), Weight[1] = (Byte[0] & 0xf)`, etc.). - The full representation occupies `((Number_of_Symbols+1)/2)` bytes, - meaning it uses a last full byte even if `Number_of_Symbols` is odd. - `Number_of_Symbols = headerByte - 127`. - Note that maximum `Number_of_Symbols` is 255-127 = 128. - A larger series must necessarily use FSE compression. +which describes how the series of weights is encoded. - if `headerByte` < 128 : - the series of weights is compressed by FSE. + the series of weights is compressed using FSE (see below). The length of the FSE-compressed series is equal to `headerByte` (0-127). 
-##### Finite State Entropy (FSE) compression of Huffman weights +- if `headerByte` >= 128 : + + the series of weights uses a direct representation, + where each `Weight` is encoded directly as a 4 bits field (0-15). + + They are encoded forward, 2 weights to a byte, + first weight taking the top four bits and second one taking the bottom four. + * e.g. the following operations could be used to read the weights: + `Weight[0] = (Byte[0] >> 4), Weight[1] = (Byte[0] & 0xf)`, etc. + + The full representation occupies `Ceiling(Number_of_Weights/2)` bytes, + meaning it uses only full bytes even if `Number_of_Weights` is odd. + + `Number_of_Weights = headerByte - 127`. + * Note that maximum `Number_of_Weights` is 255-127 = 128, + therefore, only up to 128 `Weight` can be encoded using direct representation. + * Since the last non-zero `Weight` is _not_ encoded, + this scheme is compatible with alphabet sizes of up to 129 symbols, + hence including literal symbol 128. + * If any literal symbol > 128 has a non-zero `Weight`, + direct representation is not possible. + In such case, it's necessary to use FSE compression. + + +#### Finite State Entropy (FSE) compression of Huffman weights In this case, the series of Huffman weights is compressed using FSE compression. It's a single bitstream with 2 interleaved states, @@ -1186,17 +1265,17 @@ To decode an FSE bitstream, it is necessary to know its compressed size. Compressed size is provided by `headerByte`. It's also necessary to know its _maximum possible_ decompressed size, which is `255`, since literal values span from `0` to `255`, -and last symbol's weight is not represented. +and last symbol's `Weight` is not represented. An FSE bitstream starts by a header, describing probabilities distribution. It will create a Decoding Table. -For a list of Huffman weights, the maximum accuracy log is 7 bits. +For a list of Huffman weights, the maximum accuracy log is 6 bits. For more description see the [FSE header description](#fse-table-description) The Huffman header compression uses 2 states, which share the same FSE distribution table. The first state (`State1`) encodes the even indexed symbols, -and the second (`State2`) encodes the odd indexes. +and the second (`State2`) encodes the odd indexed symbols. `State1` is initialized first, and then `State2`, and they take turns decoding a single symbol and updating their state. For more details on these FSE operations, see the [FSE section](#fse). @@ -1205,18 +1284,19 @@ The number of symbols to decode is determined by tracking bitStream overflow condition: If updating state after decoding a symbol would require more bits than remain in the stream, it is assumed that extra bits are 0. Then, -the symbols for each of the final states are decoded and the process is complete. +symbols for each of the final states are decoded and the process is complete. -##### Conversion from weights to Huffman prefix codes +#### Conversion from weights to Huffman prefix codes All present symbols shall now have a `Weight` value. -It is possible to transform weights into Number_of_Bits, using this formula: +It is possible to transform weights into `Number_of_Bits`, using this formula: ``` -Number_of_Bits = Number_of_Bits ? Max_Number_of_Bits + 1 - Weight : 0 +Number_of_Bits = (Weight>0) ? Max_Number_of_Bits + 1 - Weight : 0 ``` -Symbols are sorted by `Weight`. Within same `Weight`, symbols keep natural order. +Symbols are sorted by `Weight`. +Within same `Weight`, symbols keep natural sequential order. 
Symbols with a `Weight` of zero are removed. -Then, starting from lowest weight, prefix codes are distributed in order. +Then, starting from lowest `Weight`, prefix codes are distributed in sequential order. __Example__ : Let's presume the following list of weights has been decoded : @@ -1225,7 +1305,7 @@ Let's presume the following list of weights has been decoded : | -------- | --- | --- | --- | --- | --- | --- | | `Weight` | 4 | 3 | 2 | 0 | 1 | 1 | -Sorted by weight and then natural order, +Sorted by weight and then natural sequential order, it gives the following distribution : | Literal | 3 | 4 | 5 | 2 | 1 | 0 | @@ -1235,6 +1315,7 @@ it gives the following distribution : | prefix codes | N/A | 0000| 0001| 001 | 01 | 1 | ### Huffman-coded Streams + Given a Huffman decoding table, it's possible to decode a Huffman-coded stream. @@ -1242,7 +1323,7 @@ Each bitstream must be read _backward_, that is starting from the end down to the beginning. Therefore it's necessary to know the size of each bitstream. -It's also necessary to know exactly which _bit_ is the latest. +It's also necessary to know exactly which _bit_ is the last one. This is detected by a final bit flag : the highest bit of latest byte is a final-bit-flag. Consequently, a last byte of `0` is not possible. @@ -1312,7 +1393,7 @@ _Reserved ranges :_ - low range : <= 32767 - high range : >= (2^31) -__`Entropy_Tables`__ : following the same format as the tables in compressed blocks. +__`Entropy_Tables`__ : follow the same format as tables in [compressed blocks]. See the relevant [FSE](#fse-table-description) and [Huffman](#huffman-tree-description) sections for how to decode these tables. They are stored in following order : @@ -1330,11 +1411,16 @@ __`Content`__ : The rest of the dictionary is its content. As long as the amount of data decoded from this frame is less than or equal to `Window_Size`, sequence commands may specify offsets longer than the total length of decoded output so far to reference back to the - dictionary. After the total output has surpassed `Window_Size` however, + dictionary, even parts of the dictionary with offsets larger than `Window_Size`. + After the total output has surpassed `Window_Size` however, this is no longer allowed and the dictionary is no longer accessible. [compressed blocks]: #the-format-of-compressed_block +If a dictionary is provided by an external source, +it should be loaded with great care, its content considered untrusted. + + Appendix A - Decoding tables for predefined codes ------------------------------------------------- @@ -1521,8 +1607,32 @@ to crosscheck that an implementation build its decoding tables correctly. | 30 | 25 | 5 | 0 | | 31 | 24 | 5 | 0 | + + +Appendix B - Resources for implementers +------------------------------------------------- + +An open source reference implementation is available on : +https://github.com/facebook/zstd + +The project contains a frame generator, called [decodeCorpus], +which can be used by any 3rd-party implementation +to verify that a tested decoder is compliant with the specification. + +[decodeCorpus]: https://github.com/facebook/zstd/tree/v1.3.4/tests#decodecorpus---tool-to-generate-zstandard-frames-for-decoder-testing + +`decodeCorpus` generates random valid frames. +A compliant decoder should be able to decode them all, +or at least provide a meaningful error code explaining for which reason it cannot +(memory limit restrictions for example). 
+ + Version changes --------------- +- 0.3.0 : minor edits to match RFC8478 +- 0.2.9 : clarifications for huffman weights direct representation, by Ulrich Kunitz +- 0.2.8 : clarifications for IETF RFC discuss +- 0.2.7 : clarifications from IETF RFC review, by Vijay Gurbani and Nick Terrell - 0.2.6 : fixed an error in huffman example, by Ulrich Kunitz - 0.2.5 : minor typos and clarifications - 0.2.4 : section restructuring, by Sean Purcell diff --git a/doc/zstd_manual.html b/doc/zstd_manual.html index 623fd611d167..f9b1daa8a28c 100644 --- a/doc/zstd_manual.html +++ b/doc/zstd_manual.html @@ -1,24 +1,24 @@ -zstd 1.3.4 Manual +zstd 1.3.7 Manual -

zstd 1.3.4 Manual

+

zstd 1.3.7 Manual


Contents

  1. Introduction
  2. Version
  3. -
  4. Simple API
  5. -
  6. Explicit context
  7. -
  8. Simple dictionary API
  9. -
  10. Bulk processing dictionary API
  11. -
  12. Streaming
  13. -
  14. Streaming compression - HowTo
  15. -
  16. Streaming decompression - HowTo
  17. -
  18. START OF ADVANCED AND EXPERIMENTAL FUNCTIONS
  19. -
  20. Advanced types
  21. +
  22. Default constant
  23. +
  24. Simple API
  25. +
  26. Explicit context
  27. +
  28. Simple dictionary API
  29. +
  30. Bulk processing dictionary API
  31. +
  32. Streaming
  33. +
  34. Streaming compression - HowTo
  35. +
  36. Streaming decompression - HowTo
  37. +
  38. ADVANCED AND EXPERIMENTAL FUNCTIONS
  39. Frame size functions
  40. Memory management
  41. Advanced compression functions
  42. @@ -32,29 +32,43 @@

Introduction

-  zstd, short for Zstandard, is a fast lossless compression algorithm,
-  targeting real-time compression scenarios at zlib-level and better compression ratios.
-  The zstd compression library provides in-memory compression and decompression functions.
-  The library supports compression levels from 1 up to ZSTD_maxCLevel() which is currently 22.
-  Levels >= 20, labeled `--ultra`, should be used with caution, as they require more memory.
+  zstd, short for Zstandard, is a fast lossless compression algorithm, targeting
+  real-time compression scenarios at zlib-level and better compression ratios.
+  The zstd compression library provides in-memory compression and decompression
+  functions.
+
+  The library supports regular compression levels from 1 up to ZSTD_maxCLevel(),
+  which is currently 22. Levels >= 20, labeled `--ultra`, should be used with
+  caution, as they require more memory. The library also offers negative
+  compression levels, which extend the range of speed vs. ratio preferences.
+  The lower the level, the faster the speed (at the cost of compression).
+
   Compression can be done in:
     - a single step (described as Simple API)
     - a single step, reusing a context (described as Explicit context)
     - unbounded multiple steps (described as Streaming compression)
-  The compression ratio achievable on small data can be highly improved using a dictionary in:
-    - a single step (described as Simple dictionary API)
-    - a single step, reusing a dictionary (described as Bulk-processing dictionary API)
 
-  Advanced experimental functions can be accessed using #define ZSTD_STATIC_LINKING_ONLY before including zstd.h.
-  Advanced experimental APIs shall never be used with a dynamic library.
-  They are not "stable", their definition may change in the future. Only static linking is allowed.
+  The compression ratio achievable on small data can be highly improved using
+  a dictionary. Dictionary compression can be performed in:
+    - a single step (described as Simple dictionary API)
+    - a single step, reusing a dictionary (described as Bulk-processing
+      dictionary API)
+
+  Advanced experimental functions can be accessed using
+  `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h.
+
+  Advanced experimental APIs should never be used with a dynamically-linked
+  library. They are not "stable"; their definitions or signatures may change in
+  the future. Only static linking is allowed.
 

Version


 
 
unsigned ZSTD_versionNumber(void);   /**< useful to check dll version */
 

-

Simple API


+

Default constant


+
+

Simple API


 
 
size_t ZSTD_compress( void* dst, size_t dstCapacity,
                 const void* src, size_t srcSize,
@@ -80,7 +94,7 @@ unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
 

`src` should point to the start of a ZSTD encoded frame. `srcSize` must be at least as large as the frame header. hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. - @return : - decompressed size of the frame in `src`, if known + @return : - decompressed size of `src` frame content, if known - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) note 1 : a 0 return value means the frame is valid but "empty". @@ -90,7 +104,8 @@ unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); Optionally, application can rely on some implicit limit, as ZSTD_decompress() only needs an upper bound of decompressed size. (For example, data could be necessarily cut into blocks <= 16 KB). - note 3 : decompressed size is always present when compression is done with ZSTD_compress() + note 3 : decompressed size is always present when compression is completed using single-pass functions, + such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). note 4 : decompressed size can be very large (64-bits value), potentially larger than what local system can handle as a single memory segment. In which case, it's necessary to use streaming mode to decompress data. @@ -105,8 +120,7 @@ unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); Both functions work the same way, but ZSTD_getDecompressedSize() blends "empty", "unknown" and "error" results to the same return value (0), while ZSTD_getFrameContentSize() gives them separate return values. - `src` is the start of a zstd compressed frame. - @return : content size to be decompressed, as a 64-bits value _if known and not empty_, 0 otherwise. + @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise.
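As a concrete illustration of the notes above, here is a minimal single-pass round-trip sketch using only the stable API mentioned in this section (`ZSTD_compressBound()`, `ZSTD_compress()`, `ZSTD_getFrameContentSize()`, `ZSTD_decompress()`); error handling is abbreviated.

```
#include <stdlib.h>
#include <string.h>
#include <zstd.h>

/* Minimal round-trip sketch : returns 0 on success, 1 on any failure. */
static int roundtrip(const void* src, size_t srcSize)
{
    size_t const dstCapacity = ZSTD_compressBound(srcSize);
    void* const dst = malloc(dstCapacity);
    if (dst == NULL) return 1;

    size_t const cSize = ZSTD_compress(dst, dstCapacity, src, srcSize, 3 /* level */);
    if (ZSTD_isError(cSize)) { free(dst); return 1; }

    /* single-pass compression always stores the content size in the frame header (note 3) */
    unsigned long long const rSize = ZSTD_getFrameContentSize(dst, cSize);
    if (rSize == ZSTD_CONTENTSIZE_UNKNOWN || rSize == ZSTD_CONTENTSIZE_ERROR) { free(dst); return 1; }

    void* const rebuilt = malloc((size_t)rSize + 1);   /* +1 so an empty frame still gets a valid buffer */
    size_t const dSize = ZSTD_decompress(rebuilt, (size_t)rSize, dst, cSize);
    int const ok = rebuilt && !ZSTD_isError(dSize) && dSize == srcSize
                && memcmp(rebuilt, src, srcSize) == 0;

    free(rebuilt); free(dst);
    return ok ? 0 : 1;
}
```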


Helper functions

#define ZSTD_COMPRESSBOUND(srcSize)   ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
@@ -115,7 +129,7 @@ unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` fun
 const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string from an error code */
 int         ZSTD_maxCLevel(void);               /*!< maximum compression level available */
 

-

Explicit context


+

Explicit context


 
 

Compression context

  When compressing many times,
   it is recommended to allocate a context just once, and re-use it for each successive compression operation.
@@ -147,7 +161,7 @@ size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
 

Same as ZSTD_decompress(), requires an allocated ZSTD_DCtx (see ZSTD_createDCtx())


-

Simple dictionary API


+

Simple dictionary API


 
 
size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
                                void* dst, size_t dstCapacity,
@@ -169,14 +183,15 @@ size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
   Note : When `dict == NULL || dictSize < 8` no dictionary is used. 
 


-

Bulk processing dictionary API


+

Bulk processing dictionary API


 
 
ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
                              int compressionLevel);
 

When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once. ZSTD_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay. ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only. - `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict + `dictBuffer` can be released after ZSTD_CDict creation, since its content is copied within CDict + Note : A ZSTD_CDict can be created with an empty dictionary, but it is inefficient for small data.


size_t      ZSTD_freeCDict(ZSTD_CDict* CDict);
@@ -190,7 +205,9 @@ size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
 

Compression using a digested Dictionary. Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. Note that compression level is decided during dictionary creation. - Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) + Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) + Note : ZSTD_compress_usingCDict() can be used with a ZSTD_CDict created from an empty dictionary. + But it is inefficient for small data, and it is recommended to use ZSTD_compressCCtx().
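To make the recommendation above concrete, here is a sketch of the intended usage pattern : the dictionary is digested once with `ZSTD_createCDict()`, then reused for many small inputs. Buffer management is simplified and error handling abbreviated.

```
#include <stdlib.h>
#include <zstd.h>

/* Sketch : digest the dictionary once, reuse it for every record. */
static int compress_all(const void* dictBuffer, size_t dictSize,
                        const void* const* records, const size_t* recordSizes,
                        size_t nbRecords)
{
    ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuffer, dictSize, 3);  /* level fixed at creation */
    ZSTD_CCtx*  const cctx  = ZSTD_createCCtx();
    if (cdict == NULL || cctx == NULL) return 1;

    for (size_t i = 0; i < nbRecords; i++) {
        size_t const bound = ZSTD_compressBound(recordSizes[i]);
        void* const dst = malloc(bound);
        if (dst == NULL) break;
        size_t const cSize = ZSTD_compress_usingCDict(cctx, dst, bound,
                                                      records[i], recordSizes[i], cdict);
        if (!ZSTD_isError(cSize)) { /* store or transmit dst[0..cSize) */ }
        free(dst);
    }
    ZSTD_freeCCtx(cctx);
    ZSTD_freeCDict(cdict);
    return 0;
}
```

Since a CDict is read-only in use, the same `cdict` could also be shared by several threads, each with its own `ZSTD_CCtx`.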


ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
@@ -210,7 +227,7 @@ size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
   Faster startup than ZSTD_decompress_usingDict(), recommended when same dictionary is used multiple times. 
 


-

Streaming


+

Streaming


 
 
typedef struct ZSTD_inBuffer_s {
   const void* src;    /**< start of input buffer */
@@ -224,7 +241,7 @@ size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
   size_t pos;         /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */
 } ZSTD_outBuffer;
 

-

Streaming compression - HowTo

+

Streaming compression - HowTo

   A ZSTD_CStream object is required to track streaming operation.
   Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
   ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
@@ -232,33 +249,38 @@ size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
   since it will play nicer with system's memory, by re-using already allocated memory.
   Use one separate ZSTD_CStream per thread for parallel execution.
 
-  Start a new compression by initializing ZSTD_CStream.
+  Start a new compression by initializing a ZSTD_CStream context.
   Use ZSTD_initCStream() to start a new compression operation.
-  Use ZSTD_initCStream_usingDict() or ZSTD_initCStream_usingCDict() for a compression which requires a dictionary (experimental section)
+  Use variants ZSTD_initCStream_usingDict() or ZSTD_initCStream_usingCDict() for streaming with a dictionary (experimental section)
 
-  Use ZSTD_compressStream() repetitively to consume input stream.
-  The function will automatically update both `pos` fields.
-  Note that it may not consume the entire input, in which case `pos < size`,
-  and it's up to the caller to present again remaining data.
+  Use ZSTD_compressStream() as many times as necessary to consume input stream.
+  The function will automatically update both `pos` fields within `input` and `output`.
+  Note that the function may not consume the entire input,
+  for example, because the output buffer is already full,
+  in which case `input.pos < input.size`.
+  The caller must check if input has been entirely consumed.
+  If not, the caller must make some room to receive more compressed data,
+  typically by emptying output buffer, or allocating a new output buffer,
+  and then present again remaining input data.
   @return : a size hint, preferred nb of bytes to use as input for next function call
             or an error code, which can be tested using ZSTD_isError().
             Note 1 : it's just a hint, to help latency a little, any other value will work fine.
             Note 2 : size hint is guaranteed to be <= ZSTD_CStreamInSize()
 
-  At any moment, it's possible to flush whatever data remains within internal buffer, using ZSTD_flushStream().
-  `output->pos` will be updated.
-  Note that some content might still be left within internal buffer if `output->size` is too small.
-  @return : nb of bytes still present within internal buffer (0 if it's empty)
+  At any moment, it's possible to flush whatever data might remain stuck within internal buffer,
+  using ZSTD_flushStream(). `output->pos` will be updated.
+  Note that, if `output->size` is too small, a single invocation of ZSTD_flushStream() might not be enough (return code > 0).
+  In which case, make some room to receive more compressed data, and call again ZSTD_flushStream().
+  @return : 0 if internal buffers are entirely flushed,
+            >0 if some data is still present within internal buffer (the value is a minimal estimation of remaining size),
             or an error code, which can be tested using ZSTD_isError().
 
   ZSTD_endStream() instructs to finish a frame.
   It will perform a flush and write frame epilogue.
   The epilogue is required for decoders to consider a frame completed.
-  ZSTD_endStream() may not be able to flush full data if `output->size` is too small.
-  In which case, call again ZSTD_endStream() to complete the flush.
+  The flush() operation is the same, and follows the same rules as ZSTD_flushStream().
   @return : 0 if frame fully completed and fully flushed,
-             or >0 if some data is still present within internal buffer
-                  (value is minimum size estimation for remaining data to flush, but it could be more)
+            >0 if some data is still present within internal buffer (the value is a minimal estimation of remaining size),
             or an error code, which can be tested using ZSTD_isError().
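A sketch of the loop these notes describe, file-to-file, using only the stable streaming functions documented in this section, with buffer sizes taken from `ZSTD_CStreamInSize()` / `ZSTD_CStreamOutSize()`; error handling and cleanup on failure are abbreviated.

```
#include <stdio.h>
#include <stdlib.h>
#include <zstd.h>

static int stream_compress(FILE* fin, FILE* fout, int level)
{
    size_t const inCap  = ZSTD_CStreamInSize();
    size_t const outCap = ZSTD_CStreamOutSize();
    void* const inBuf  = malloc(inCap);
    void* const outBuf = malloc(outCap);
    ZSTD_CStream* const zcs = ZSTD_createCStream();
    if (ZSTD_isError(ZSTD_initCStream(zcs, level))) return 1;

    size_t readSize;
    while ((readSize = fread(inBuf, 1, inCap, fin)) > 0) {
        ZSTD_inBuffer input = { inBuf, readSize, 0 };
        while (input.pos < input.size) {              /* present remaining input again if needed */
            ZSTD_outBuffer output = { outBuf, outCap, 0 };
            size_t const hint = ZSTD_compressStream(zcs, &output, &input);
            if (ZSTD_isError(hint)) return 1;
            fwrite(outBuf, 1, output.pos, fout);
        }
    }
    size_t remaining;
    do {                                              /* finish the frame, flushing as many times as needed */
        ZSTD_outBuffer output = { outBuf, outCap, 0 };
        remaining = ZSTD_endStream(zcs, &output);
        if (ZSTD_isError(remaining)) return 1;
        fwrite(outBuf, 1, output.pos, fout);
    } while (remaining > 0);

    ZSTD_freeCStream(zcs);
    free(inBuf); free(outBuf);
    return 0;
}
```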
 
  
@@ -278,7 +300,7 @@ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
 

size_t ZSTD_CStreamOutSize(void);   /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block in all circumstances. */
 

-

Streaming decompression - HowTo

+

Streaming decompression - HowTo

   A ZSTD_DStream object is required to track streaming operations.
   Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
   ZSTD_DStream objects can be re-used multiple times.
@@ -291,11 +313,17 @@ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
   The function will update both `pos` fields.
   If `input.pos < input.size`, some input has not been consumed.
   It's up to the caller to present again remaining data.
+  The function tries to flush all data decoded immediately, respecting buffer sizes.
   If `output.pos < output.size`, decoder has flushed everything it could.
-  @return : 0 when a frame is completely decoded and fully flushed,
-            an error code, which can be tested using ZSTD_isError(),
-            any other value > 0, which means there is still some decoding to do to complete current frame.
-            The return value is a suggested next input size (a hint to improve latency) that will never load more than the current frame.
+  But if `output.pos == output.size`, there is no such guarantee,
+  it's likely that some decoded data was not flushed and still remains within internal buffers.
+  In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer.
+  When no additional input is provided, the amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX.
+ @return : 0 when a frame is completely decoded and fully flushed,
+        or an error code, which can be tested using ZSTD_isError(),
+        or any other value > 0, which means there is still some decoding or flushing to do to complete current frame :
+                                the return value is a suggested next input size (a hint for better latency)
+                                that will never load more than the current frame.
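A matching sketch of the decompression loop described above, mirroring the compression example and again abbreviating error handling; buffer sizes come from `ZSTD_DStreamInSize()` / `ZSTD_DStreamOutSize()`.

```
#include <stdio.h>
#include <stdlib.h>
#include <zstd.h>

static int stream_decompress(FILE* fin, FILE* fout)
{
    size_t const inCap  = ZSTD_DStreamInSize();
    size_t const outCap = ZSTD_DStreamOutSize();
    void* const inBuf  = malloc(inCap);
    void* const outBuf = malloc(outCap);
    ZSTD_DStream* const zds = ZSTD_createDStream();
    if (ZSTD_isError(ZSTD_initDStream(zds))) return 1;

    size_t readSize;
    while ((readSize = fread(inBuf, 1, inCap, fin)) > 0) {
        ZSTD_inBuffer input = { inBuf, readSize, 0 };
        while (input.pos < input.size) {
            ZSTD_outBuffer output = { outBuf, outCap, 0 };
            size_t const ret = ZSTD_decompressStream(zds, &output, &input);
            if (ZSTD_isError(ret)) return 1;
            fwrite(outBuf, 1, output.pos, fout);
            /* ret == 0 : a frame is fully decoded and flushed ;
             * ret > 0  : a size hint for the next input. */
        }
    }
    ZSTD_freeDStream(zds);
    free(inBuf); free(outBuf);
    return 0;
}
```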
  
 
@@ -311,15 +339,16 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB

size_t ZSTD_DStreamOutSize(void);   /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */
 

-

START OF ADVANCED AND EXPERIMENTAL FUNCTIONS

 The definitions in this section are considered experimental.
+

ADVANCED AND EXPERIMENTAL FUNCTIONS

+ The definitions in this section are considered experimental.
  They should never be used with a dynamic library, as prototypes may change in the future.
  They are provided for advanced scenarios.
  Use them only in association with static linking.
  
 
-

Advanced types


-
+
int ZSTD_minCLevel(void);  /*!< minimum negative compression level allowed */
+

typedef enum { ZSTD_fast=1, ZSTD_dfast, ZSTD_greedy, ZSTD_lazy, ZSTD_lazy2,
                ZSTD_btlazy2, ZSTD_btopt, ZSTD_btultra } ZSTD_strategy;   /* from faster to stronger */
 

@@ -389,9 +418,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB


size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
-

`src` should point to the start of a ZSTD frame - `srcSize` must be >= ZSTD_frameHeaderSize_prefix. - @return : size of the Frame Header +

srcSize must be >= ZSTD_frameHeaderSize_prefix. + @return : size of the Frame Header, + or an error code (if srcSize is too small)


Memory management


@@ -577,21 +606,40 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict*
 

size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
 

start a new compression job, using same parameters from previous job. - This is typically useful to skip dictionary loading stage, since it will re-use it in-place.. + This is typically useful to skip dictionary loading stage, since it will re-use it in-place. Note that zcs must be init at least once before using ZSTD_resetCStream(). If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs, but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead. - @return : 0, or an error code (which can be tested using ZSTD_isError()) + @return : 0, or an error code (which can be tested using ZSTD_isError()) +


typedef struct {
-    unsigned long long ingested;
-    unsigned long long consumed;
-    unsigned long long produced;
+    unsigned long long ingested;   /* nb input bytes read and buffered */
+    unsigned long long consumed;   /* nb input bytes actually compressed */
+    unsigned long long produced;   /* nb of compressed bytes generated and buffered */
+    unsigned long long flushed;    /* nb of compressed bytes flushed : not provided; can be tracked from caller side */
+    unsigned currentJobID;         /* MT only : latest started job nb */
+    unsigned nbActiveWorkers;      /* MT only : nb of workers actively compressing at probe time */
 } ZSTD_frameProgression;
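A small sketch of how these counters can be polled from the caller side. It assumes the accompanying experimental accessor `ZSTD_getFrameProgression()` (not shown in this excerpt) together with `ZSTD_toFlushNow()` declared just below, both exposed behind `ZSTD_STATIC_LINKING_ONLY`.

```
#include <stdio.h>
#define ZSTD_STATIC_LINKING_ONLY   /* experimental symbols */
#include <zstd.h>

/* Sketch : report progress of an ongoing (typically multithreaded) compression job. */
static void report_progress(ZSTD_CCtx* cctx)
{
    ZSTD_frameProgression const fp = ZSTD_getFrameProgression(cctx);   /* assumed accessor */
    size_t const flushable = ZSTD_toFlushNow(cctx);
    fprintf(stderr, "ingested=%llu consumed=%llu produced=%llu, %zu bytes flushable now\n",
            fp.ingested, fp.consumed, fp.produced, flushable);
}
```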
 

+
size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
+

Tell how many bytes are ready to be flushed immediately. + Useful for multithreading scenarios (nbWorkers >= 1). + Probe the oldest active job, defined as oldest job not yet entirely flushed, + and check its output buffer. + @return : amount of data stored in oldest job and ready to be flushed immediately. + if @return == 0, it means either : + + there is no active job (could be checked with ZSTD_frameProgression()), or + + oldest job is still actively compressing data, + but everything it has produced has also been flushed so far, + therefore flushing speed is currently limited by production speed of oldest job + irrespective of the speed of concurrent newer jobs. + +


+

Advanced Streaming decompression functions

typedef enum { DStream_p_maxWindowSize } ZSTD_DStreamParameter_e;
 size_t ZSTD_setDStreamParameter(ZSTD_DStream* zds, ZSTD_DStreamParameter_e paramType, unsigned paramValue);   /* obsolete : this API will be removed in a future version */
 size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /**< note: no dictionary will be used if dict == NULL or dictSize < 8 */
@@ -722,6 +770,11 @@ typedef struct {
     unsigned dictID;
     unsigned checksumFlag;
 } ZSTD_frameHeader;
+/** ZSTD_getFrameHeader() :
+ *  decode the Frame Header, or require a larger `srcSize`.
+ * @return : 0, `zfhPtr` is correctly filled,
+ *          >0, `srcSize` is too small, the return value is the wanted `srcSize` amount,
+ *           or an error code, which can be tested using ZSTD_isError() */
 size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   /**< doesn't consume input */
 size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize);  /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
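A sketch showing how these two helpers combine when sizing buffers for streaming decompression of a frame whose beginning has already been received. It assumes the `windowSize` and `frameContentSize` fields of `ZSTD_frameHeader` (the full struct is truncated in this excerpt); `headerBuf`/`headerSize` are caller-provided.

```
#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

/* Returns the minimum decoding buffer size for the frame starting in headerBuf,
 * or 0 if more input is needed or the header is invalid. */
static size_t min_decoding_buffer(const void* headerBuf, size_t headerSize)
{
    ZSTD_frameHeader zfh;
    size_t const ret = ZSTD_getFrameHeader(&zfh, headerBuf, headerSize);
    if (ret != 0) return 0;   /* >0 : need more input ; error : invalid header */

    /* frameContentSize may be ZSTD_CONTENTSIZE_UNKNOWN, which the helper accepts */
    return ZSTD_decodingBufferSize_min(zfh.windowSize, zfh.frameContentSize);
}
```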
 

@@ -753,7 +806,7 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long /* compression parameters */ ZSTD_p_compressionLevel=100, /* Update all compression parameters according to pre-defined cLevel table * Default level is ZSTD_CLEVEL_DEFAULT==3. - * Special: value 0 means "do not change cLevel". + * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT. * Note 1 : it's possible to pass a negative compression level by casting it to unsigned type. * Note 2 : setting a level sets all default values of other compression parameters. * Note 3 : setting compressionLevel automatically updates ZSTD_p_compressLiterals. */ @@ -762,16 +815,19 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long * Special: value 0 means "use default windowLog". * Note: Using a window size greater than ZSTD_MAXWINDOWSIZE_DEFAULT (default: 2^27) * requires explicitly allowing such window size during decompression stage. */ - ZSTD_p_hashLog, /* Size of the probe table, as a power of 2. + ZSTD_p_hashLog, /* Size of the initial probe table, as a power of 2. * Resulting table size is (1 << (hashLog+2)). * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX. * Larger tables improve compression ratio of strategies <= dFast, * and improve speed of strategies > dFast. * Special: value 0 means "use default hashLog". */ - ZSTD_p_chainLog, /* Size of the full-search table, as a power of 2. + ZSTD_p_chainLog, /* Size of the multi-probe search table, as a power of 2. * Resulting table size is (1 << (chainLog+2)). + * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX. * Larger tables result in better and slower compression. * This parameter is useless when using "fast" strategy. + * Note it's still useful when using "dfast" strategy, + * in which case it defines a secondary probe table. * Special: value 0 means "use default chainLog". */ ZSTD_p_searchLog, /* Number of search attempts, as a power of 2. * More attempts result in better and slower compression. @@ -853,26 +909,51 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long /* experimental parameters - no stability guaranteed */ /* =================================================================== */ - ZSTD_p_compressLiterals=1000, /* control huffman compression of literals (enabled) by default. - * disabling it improves speed and decreases compression ratio by a large amount. - * note : this setting is automatically updated when changing compression level. - * positive compression levels set ZSTD_p_compressLiterals to 1. - * negative compression levels set ZSTD_p_compressLiterals to 0. */ - ZSTD_p_forceMaxWindow=1100, /* Force back-reference distances to remain < windowSize, * even when referencing into Dictionary content (default:0) */ + ZSTD_p_forceAttachDict, /* ZSTD supports usage of a CDict in-place + * (avoiding having to copy the compression tables + * from the CDict into the working context). Using + * a CDict in this way saves an initial setup step, + * but comes at the cost of more work per byte of + * input. ZSTD has a simple internal heuristic that + * guesses which strategy will be faster. You can + * use this flag to override that guess. + * + * Note that the by-reference, in-place strategy is + * only used when reusing a compression context + * with compatible compression parameters. 
(If + * incompatible / uninitialized, the working + * context needs to be cleared anyways, which is + * about as expensive as overwriting it with the + * dictionary context, so there's no savings in + * using the CDict by-ref.) + * + * Values greater than 0 force attaching the dict. + * Values less than 0 force copying the dict. + * 0 selects the default heuristic-guided behavior. + */ } ZSTD_cParameter;
size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned value);
 

Set one compression parameter, selected by enum ZSTD_cParameter. - Setting a parameter is generally only possible during frame initialization (before starting compression), - except for a few exceptions which can be updated during compression: compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy. - Note : when `value` is an enum, cast it to unsigned for proper type checking. - @result : informational value (typically, value being set clamped correctly), + Setting a parameter is generally only possible during frame initialization (before starting compression). + Exception : when using multi-threading mode (nbThreads >= 1), + following parameters can be updated _during_ compression (within same frame): + => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy. + new parameters will be active on next job, or after a flush(). + Note : when `value` type is not unsigned (int, or enum), cast it to unsigned for proper type checking. + @result : informational value (typically, value being set, correctly clamped), or an error code (which can be tested with ZSTD_isError()).


+
size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned* value);
+

Get the requested value of one compression parameter, selected by enum ZSTD_cParameter. + @result : 0, or an error code (which can be tested with ZSTD_isError()). + +


+
size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);
 

Total input data size to be compressed as a single frame. This value will be controlled at the end, and result in error if not respected. @@ -916,19 +997,27 @@ size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size Note 2 : CDict is just referenced, its lifetime must outlive CCtx.


-
size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize);
-size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
+                           const void* prefix, size_t prefixSize);
+size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx,
+                           const void* prefix, size_t prefixSize,
+                           ZSTD_dictContentType_e dictContentType);
 

Reference a prefix (single-usage dictionary) for next compression job. - Decompression need same prefix to properly regenerate data. - Prefix is **only used once**. Tables are discarded at end of compression job. - Subsequent compression jobs will be done without prefix (if none is explicitly referenced). - If there is a need to use same prefix multiple times, consider embedding it into a ZSTD_CDict instead. + Decompression will need same prefix to properly regenerate data. + Compressing with a prefix is similar in outcome as performing a diff and compressing it, + but performs much faster, especially during decompression (compression speed is tunable with compression level). + Note that prefix is **only used once**. Tables are discarded at end of compression job (ZSTD_e_end). @result : 0, or an error code (which can be tested with ZSTD_isError()). Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary - Note 1 : Prefix buffer is referenced. It must outlive compression job. - Note 2 : Referencing a prefix involves building tables, which are dependent on compression parameters. + Note 1 : Prefix buffer is referenced. It **must** outlive compression job. + Its contain must remain unmodified up to end of compression (ZSTD_e_end). + Note 2 : If the intention is to diff some large src data blob with some prior version of itself, + ensure that the window size is large enough to contain the entire source. + See ZSTD_p_windowLog. + Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters. It's a CPU consuming operation, with non-negligible impact on latency. - Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent). + If there is a need to use same prefix multiple times, consider loadDictionary instead. + Note 4 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent). Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode.


@@ -936,16 +1025,27 @@ size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t

Return a CCtx to clean state. Useful after an error, or to interrupt an ongoing compression job and start a new one. Any internal data not yet flushed is cancelled. + The parameters and dictionary are kept unchanged, to reset them use ZSTD_CCtx_resetParameters(). + +


+ +
size_t ZSTD_CCtx_resetParameters(ZSTD_CCtx* cctx);
+

All parameters are back to default values (compression level is ZSTD_CLEVEL_DEFAULT). Dictionary (if any) is dropped. - All parameters are back to default values. - It's possible to modify compression parameters after a reset. + Resetting parameters is only possible during frame initialization (before starting compression). + To reset the context use ZSTD_CCtx_reset(). + @return 0 or an error code (which can be checked with ZSTD_isError()).


typedef enum {
-    ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal conditions */
-    ZSTD_e_flush,      /* flush any data provided so far - frame will continue, future data can still reference previous data for better compression */
-    ZSTD_e_end         /* flush any remaining data and close current frame. Any additional data starts a new frame. */
+    ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */
+    ZSTD_e_flush,      /* flush any data provided so far,
+                        * it creates (at least) one new block, that can be decoded immediately on reception;
+                        * frame will continue: any future data can still reference previously compressed data, improving compression. */
+    ZSTD_e_end         /* flush any remaining data and close current frame.
+                        * any additional data starts a new frame.
+                        * each frame is independent (does not reference any content from previous frame). */
 } ZSTD_EndDirective;
 

size_t ZSTD_compress_generic (ZSTD_CCtx* cctx,
@@ -1033,6 +1133,13 @@ size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);
  
 


+
size_t ZSTD_CCtxParam_getParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, unsigned* value);
+

Similar to ZSTD_CCtx_getParameter. + Get the requested value of one compression parameter, selected by enum ZSTD_cParameter. + @result : 0, or an error code (which can be tested with ZSTD_isError()). + +


+
size_t ZSTD_CCtx_setParametersUsingCCtxParams(
         ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
 

Apply a set of ZSTD_CCtx_params to the compression context. @@ -1043,7 +1150,8 @@ size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);


-

Advanced parameters for decompression API


+

Advanced decompression API

/* ==================================== */
+

size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
 size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
 size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
@@ -1074,17 +1182,25 @@ size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size
  
 


-
size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize);
-size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx,
+                        const void* prefix, size_t prefixSize);
+size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx,
+                        const void* prefix, size_t prefixSize,
+                        ZSTD_dictContentType_e dictContentType);
 

Reference a prefix (single-usage dictionary) for next compression job. - Prefix is **only used once**. It must be explicitly referenced before each frame. - If there is a need to use same prefix multiple times, consider embedding it into a ZSTD_DDict instead. + This is the reverse operation of ZSTD_CCtx_refPrefix(), + and must use the same prefix as the one used during compression. + Prefix is **only used once**. Reference is discarded at end of frame. + End of frame is reached when ZSTD_DCtx_decompress_generic() returns 0. @result : 0, or an error code (which can be tested with ZSTD_isError()). Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary - Note 2 : Prefix buffer is referenced. It must outlive compression job. + Note 2 : Prefix buffer is referenced. It **must** outlive decompression job. + Prefix buffer must remain unmodified up to the end of frame, + reached when ZSTD_DCtx_decompress_generic() returns 0. Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent). Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode. Note 4 : Referencing a raw content prefix has almost no cpu nor memory cost. + A fulldict prefix is more costly though.


@@ -1105,6 +1221,12 @@ size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t


+
size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr,
+            const void* src, size_t srcSize, ZSTD_format_e format);
+

same as ZSTD_getFrameHeader(), + with added capability to select a format (like ZSTD_f_zstd1_magicless) +


+
size_t ZSTD_decompress_generic(ZSTD_DCtx* dctx,
                                ZSTD_outBuffer* output,
                                ZSTD_inBuffer* input);
diff --git a/lib/BUCK b/lib/BUCK
index dbe8885fc4cf..bd93b082a347 100644
--- a/lib/BUCK
+++ b/lib/BUCK
@@ -69,6 +69,7 @@ cxx_library(
     ]),
     headers=subdir_glob([
         ('dictBuilder', 'divsufsort.h'),
+        ('dictBuilder', 'cover.h'),
     ]),
     srcs=glob(['dictBuilder/*.c']),
     deps=[':common'],
diff --git a/lib/Makefile b/lib/Makefile
index cdfdc5cdfd46..9711f75eea87 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -19,23 +19,62 @@ LIBVER := $(shell echo $(LIBVER_SCRIPT))
 VERSION?= $(LIBVER)
 
 CPPFLAGS+= -I. -I./common -DXXH_NAMESPACE=ZSTD_
+ifeq ($(OS),Windows_NT)   # MinGW assumed
+CPPFLAGS   += -D__USE_MINGW_ANSI_STDIO   # compatibility with %zu formatting
+endif
 CFLAGS  ?= -O3
-DEBUGFLAGS = -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
+DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
             -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
             -Wstrict-prototypes -Wundef -Wpointer-arith -Wformat-security \
             -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
-            -Wredundant-decls
+            -Wredundant-decls -Wmissing-prototypes
 CFLAGS  += $(DEBUGFLAGS) $(MOREFLAGS)
 FLAGS    = $(CPPFLAGS) $(CFLAGS)
 
+GREP = grep --color=never
 
-ZSTD_FILES := $(sort $(wildcard common/*.c compress/*.c decompress/*.c dictBuilder/*.c deprecated/*.c))
+ZSTDCOMMON_FILES := $(sort $(wildcard common/*.c))
+ZSTDCOMP_FILES := $(sort $(wildcard compress/*.c))
+ZSTDDECOMP_FILES := $(sort $(wildcard decompress/*.c))
+ZDICT_FILES := $(sort $(wildcard dictBuilder/*.c))
+ZDEPR_FILES := $(sort $(wildcard deprecated/*.c))
+ZSTD_FILES := $(ZSTDCOMMON_FILES)
 
-ZSTD_LEGACY_SUPPORT ?= 4
+ZSTD_LEGACY_SUPPORT ?= 5
+ZSTD_LIB_COMPRESSION ?= 1
+ZSTD_LIB_DECOMPRESSION ?= 1
+ZSTD_LIB_DICTBUILDER ?= 1
+ZSTD_LIB_DEPRECATED ?= 1
+
+ifeq ($(ZSTD_LIB_COMPRESSION), 0)
+	ZSTD_LIB_DICTBUILDER = 0
+	ZSTD_LIB_DEPRECATED = 0
+endif
+
+ifeq ($(ZSTD_LIB_DECOMPRESSION), 0)
+	ZSTD_LEGACY_SUPPORT = 0
+	ZSTD_LIB_DEPRECATED = 0
+endif
+
+ifneq ($(ZSTD_LIB_COMPRESSION), 0)
+	ZSTD_FILES += $(ZSTDCOMP_FILES)
+endif
+
+ifneq ($(ZSTD_LIB_DECOMPRESSION), 0)
+	ZSTD_FILES += $(ZSTDDECOMP_FILES)
+endif
+
+ifneq ($(ZSTD_LIB_DEPRECATED), 0)
+	ZSTD_FILES += $(ZDEPR_FILES)
+endif
+
+ifneq ($(ZSTD_LIB_DICTBUILDER), 0)
+	ZSTD_FILES += $(ZDICT_FILES)
+endif
 
 ifneq ($(ZSTD_LEGACY_SUPPORT), 0)
 ifeq ($(shell test $(ZSTD_LEGACY_SUPPORT) -lt 8; echo $$?), 0)
-	ZSTD_FILES += $(shell ls legacy/*.c | grep 'v0[$(ZSTD_LEGACY_SUPPORT)-7]')
+	ZSTD_FILES += $(shell ls legacy/*.c | $(GREP) 'v0[$(ZSTD_LEGACY_SUPPORT)-7]')
 endif
 	CPPFLAGS += -I./legacy
 endif
@@ -43,7 +82,7 @@ CPPFLAGS  += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
 
 ZSTD_OBJ   := $(patsubst %.c,%.o,$(ZSTD_FILES))
 
-# OS X linker doesn't support -soname, and use different extension
+# macOS linker doesn't support -soname, and uses a different extension
 # see : https://developer.apple.com/library/mac/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html
 ifeq ($(shell uname), Darwin)
 	SHARED_EXT = dylib
@@ -57,8 +96,6 @@ else
 	SHARED_EXT_VER = $(SHARED_EXT).$(LIBVER)
 endif
 
-LIBZSTD = libzstd.$(SHARED_EXT_VER)
-
 
 .PHONY: default all clean install uninstall
 
@@ -74,19 +111,28 @@ libzstd.a: $(ZSTD_OBJ)
 libzstd.a-mt: CPPFLAGS += -DZSTD_MULTITHREAD
 libzstd.a-mt: libzstd.a
 
+ifneq (,$(filter Windows%,$(OS)))
+
+LIBZSTD = dll\libzstd.dll
+$(LIBZSTD): $(ZSTD_FILES)
+	@echo compiling dynamic library $(LIBVER)
+	@$(CC) $(FLAGS) -DZSTD_DLL_EXPORT=1 -shared $^ -o $@
+	dlltool -D $@ -d dll\libzstd.def -l dll\libzstd.lib
+
+else
+
+LIBZSTD = libzstd.$(SHARED_EXT_VER)
 $(LIBZSTD): LDFLAGS += -shared -fPIC -fvisibility=hidden
 $(LIBZSTD): $(ZSTD_FILES)
 	@echo compiling dynamic library $(LIBVER)
-ifneq (,$(filter Windows%,$(OS)))
-	@$(CC) $(FLAGS) -DZSTD_DLL_EXPORT=1 -shared $^ -o dll\libzstd.dll
-	dlltool -D dll\libzstd.dll -d dll\libzstd.def -l dll\libzstd.lib
-else
 	@$(CC) $(FLAGS) $^ $(LDFLAGS) $(SONAME_FLAGS) -o $@
 	@echo creating versioned links
 	@ln -sf $@ libzstd.$(SHARED_EXT_MAJOR)
 	@ln -sf $@ libzstd.$(SHARED_EXT)
+
 endif
 
+
 libzstd : $(LIBZSTD)
 
 libzstd-mt : CPPFLAGS += -DZSTD_MULTITHREAD
@@ -111,16 +157,16 @@ libzstd-nomt: $(ZSTD_NOMT_FILES)
 	@$(CC) $(FLAGS) $^ $(LDFLAGS) $(SONAME_FLAGS) -o $@
 
 clean:
-	@$(RM) -r *.dSYM   # Mac OS-X specific
+	@$(RM) -r *.dSYM   # macOS-specific
 	@$(RM) core *.o *.a *.gcda *.$(SHARED_EXT) *.$(SHARED_EXT).* libzstd.pc
 	@$(RM) dll/libzstd.dll dll/libzstd.lib libzstd-nomt*
 	@$(RM) common/*.o compress/*.o decompress/*.o dictBuilder/*.o legacy/*.o deprecated/*.o
 	@echo Cleaning library completed
 
 #-----------------------------------------------------------------------------
-# make install is validated only for Linux, OSX, BSD, Hurd and Solaris targets
+# make install is validated only for Linux, macOS, BSD, Hurd and Solaris targets
 #-----------------------------------------------------------------------------
-ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS))
+ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS Haiku))
 
 DESTDIR     ?=
 # directory variables : GNU conventions prefer lowercase
@@ -134,7 +180,7 @@ LIBDIR      ?= $(libdir)
 includedir  ?= $(PREFIX)/include
 INCLUDEDIR  ?= $(includedir)
 
-ifneq (,$(filter $(shell uname),OpenBSD FreeBSD NetBSD DragonFly))
+ifneq (,$(filter $(shell uname),FreeBSD NetBSD DragonFly))
 PKGCONFIGDIR ?= $(PREFIX)/libdata/pkgconfig
 else
 PKGCONFIGDIR ?= $(LIBDIR)/pkgconfig
@@ -159,20 +205,32 @@ libzstd.pc: libzstd.pc.in
              -e 's|@VERSION@|$(VERSION)|' \
              $< >$@
 
-install: libzstd.a libzstd libzstd.pc
-	@$(INSTALL) -d -m 755 $(DESTDIR)$(PKGCONFIGDIR)/ $(DESTDIR)$(INCLUDEDIR)/
+install: install-pc install-static install-shared install-includes
+	@echo zstd static and shared library installed
+
+install-pc: libzstd.pc
+	@$(INSTALL) -d -m 755 $(DESTDIR)$(PKGCONFIGDIR)/
 	@$(INSTALL_DATA) libzstd.pc $(DESTDIR)$(PKGCONFIGDIR)/
-	@echo Installing libraries
+
+install-static: libzstd.a
+	@echo Installing static library
+	@$(INSTALL) -d -m 755 $(DESTDIR)$(LIBDIR)/
 	@$(INSTALL_DATA) libzstd.a $(DESTDIR)$(LIBDIR)
+
+install-shared: libzstd
+	@echo Installing shared library
+	@$(INSTALL) -d -m 755 $(DESTDIR)$(LIBDIR)/
 	@$(INSTALL_PROGRAM) $(LIBZSTD) $(DESTDIR)$(LIBDIR)
 	@ln -sf $(LIBZSTD) $(DESTDIR)$(LIBDIR)/libzstd.$(SHARED_EXT_MAJOR)
 	@ln -sf $(LIBZSTD) $(DESTDIR)$(LIBDIR)/libzstd.$(SHARED_EXT)
+
+install-includes:
 	@echo Installing includes
+	@$(INSTALL) -d -m 755 $(DESTDIR)$(INCLUDEDIR)/
 	@$(INSTALL_DATA) zstd.h $(DESTDIR)$(INCLUDEDIR)
 	@$(INSTALL_DATA) common/zstd_errors.h $(DESTDIR)$(INCLUDEDIR)
 	@$(INSTALL_DATA) deprecated/zbuff.h $(DESTDIR)$(INCLUDEDIR)     # prototypes generate deprecation warnings
 	@$(INSTALL_DATA) dictBuilder/zdict.h $(DESTDIR)$(INCLUDEDIR)
-	@echo zstd static and shared library installed
 
 uninstall:
 	@$(RM) $(DESTDIR)$(LIBDIR)/libzstd.a
diff --git a/lib/README.md b/lib/README.md
index 95196e46784c..0966c7aef497 100644
--- a/lib/README.md
+++ b/lib/README.md
@@ -13,7 +13,7 @@ including commands variables, staged install, directory variables and standard t
 - `make install` : install libraries in default system directories
 
 `libzstd` default scope includes compression, decompression, dictionary building,
-and decoding support for legacy formats >= v0.4.0.
+and decoding support for legacy formats >= v0.5.0.
 
 
 #### API
@@ -48,19 +48,24 @@ It's possible to compile only a limited set of features.
         This module depends on both `lib/common` and `lib/compress` .
 - `lib/legacy` : source code to decompress legacy zstd formats, starting from `v0.1.0`.
         This module depends on `lib/common` and `lib/decompress`.
-        To enable this feature, it's required to define `ZSTD_LEGACY_SUPPORT` during compilation.
-        Typically, with `gcc`, add argument `-DZSTD_LEGACY_SUPPORT=1`.
-        Using higher number limits versions supported.
+        To enable this feature, define `ZSTD_LEGACY_SUPPORT` during compilation.
+        Specifying a number restricts legacy support to formats from that version onward.
         For example, `ZSTD_LEGACY_SUPPORT=2` means : "support legacy formats >= v0.2.0".
         `ZSTD_LEGACY_SUPPORT=3` means : "support legacy formats >= v0.3.0", and so on.
-        Starting v0.8.0, all versions of `zstd` produce frames compliant with specification.
-        As a consequence, `ZSTD_LEGACY_SUPPORT=8` (or more) doesn't trigger legacy support.
-        Also, `ZSTD_LEGACY_SUPPORT=0` means "do __not__ support legacy formats".
+        Currently, the default library setting is `ZSTD_LEGACY_SUPPORT=5`.
+        It can be changed at build time to any other value.
+        Note that any number >= 8 translates into "do __not__ support legacy formats",
+        since all versions of `zstd` >= v0.8 are compatible with v1+ specification.
+        `ZSTD_LEGACY_SUPPORT=0` also means "do __not__ support legacy formats".
         Once enabled, this capability is transparently triggered within decompression functions.
         It's also possible to invoke directly legacy API, as exposed in `lib/legacy/zstd_legacy.h`.
         Each version also provides an additional dedicated set of advanced API.
         For example, advanced API for version `v0.4` is exposed in `lib/legacy/zstd_v04.h` .
         Note : `lib/legacy` only supports _decoding_ legacy formats.
+- Similarly, you can define `ZSTD_LIB_COMPRESSION`, `ZSTD_LIB_DECOMPRESSION`, `ZSTD_LIB_DICTBUILDER`,
+        and `ZSTD_LIB_DEPRECATED` as 0 to forgo compilation of the corresponding features. This will
+        also disable compilation of all dependencies (e.g. `ZSTD_LIB_COMPRESSION=0` will also disable
+        dictBuilder).
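
For illustration, a minimal sketch (not part of the patch) of what "transparently triggered" means for callers: with legacy support compiled in (e.g. `-DZSTD_LEGACY_SUPPORT=2`), the regular public API decodes old frames through the exact same call. Only `ZSTD_decompress`, `ZSTD_isError` and `ZSTD_getErrorName` below are real API; the wrapper name and buffers are illustrative.

    #include <stdio.h>
    #include "zstd.h"

    /* Decodes both current frames and, when the library was built with
     * ZSTD_LEGACY_SUPPORT >= 1, legacy frames from the selected version onward.
     * No dedicated legacy code path is needed on the caller side. */
    static size_t decompress_any_frame(void* dst, size_t dstCapacity,
                                       const void* src, size_t srcSize)
    {
        size_t const r = ZSTD_decompress(dst, dstCapacity, src, srcSize);
        if (ZSTD_isError(r)) {
            fprintf(stderr, "decompression error : %s \n", ZSTD_getErrorName(r));
            return 0;
        }
        return r;   /* nb of bytes written into dst */
    }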
 
 
 #### Multithreading support
diff --git a/lib/common/bitstream.h b/lib/common/bitstream.h
index f7f389fe0fa8..ef89b9878e22 100644
--- a/lib/common/bitstream.h
+++ b/lib/common/bitstream.h
@@ -1,8 +1,7 @@
 /* ******************************************************************
    bitstream
    Part of FSE library
-   header file (to include)
-   Copyright (C) 2013-2017, Yann Collet.
+   Copyright (C) 2013-present, Yann Collet.
 
    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -49,21 +48,10 @@ extern "C" {
 *  Dependencies
 ******************************************/
 #include "mem.h"            /* unaligned access routines */
+#include "debug.h"          /* assert(), DEBUGLOG(), RAWLOG() */
 #include "error_private.h"  /* error codes and messages */
 
 
-/*-*************************************
-*  Debug
-***************************************/
-#if defined(BIT_DEBUG) && (BIT_DEBUG>=1)
-#  include <assert.h>
-#else
-#  ifndef assert
-#    define assert(condition) ((void)0)
-#  endif
-#endif
-
-
 /*=========================================
 *  Target specific
 =========================================*/
@@ -83,8 +71,7 @@ extern "C" {
  * A critical property of these streams is that they encode and decode in **reverse** direction.
  * So the first bit sequence you add will be the last to be read, like a LIFO stack.
  */
-typedef struct
-{
+typedef struct {
     size_t bitContainer;
     unsigned bitPos;
     char*  startPtr;
@@ -118,8 +105,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
 /*-********************************************
 *  bitStream decoding API (read backward)
 **********************************************/
-typedef struct
-{
+typedef struct {
     size_t   bitContainer;
     unsigned bitsConsumed;
     const char* ptr;
@@ -236,7 +222,8 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
 }
 
 /*! BIT_addBitsFast() :
- *  works only if `value` is _clean_, meaning all high bits above nbBits are 0 */
+ *  works only if `value` is _clean_,
+ *  meaning all high bits above nbBits are 0 */
 MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC,
                                 size_t value, unsigned nbBits)
 {
@@ -352,17 +339,10 @@ MEM_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start)
 
 MEM_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits)
 {
-#if defined(__BMI__) && defined(__GNUC__) && __GNUC__*1000+__GNUC_MINOR__ >= 4008  /* experimental */
-#  if defined(__x86_64__)
-    if (sizeof(bitContainer)==8)
-        return _bextr_u64(bitContainer, start, nbBits);
-    else
-#  endif
-        return _bextr_u32(bitContainer, start, nbBits);
-#else
+    U32 const regMask = sizeof(bitContainer)*8 - 1;
+    /* if start > regMask, bitstream is corrupted, and result is undefined */
     assert(nbBits < BIT_MASK_SIZE);
-    return (bitContainer >> start) & BIT_mask[nbBits];
-#endif
+    return (bitContainer >> (start & regMask)) & BIT_mask[nbBits];
 }
 
 MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
@@ -379,9 +359,13 @@ MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
  * @return : value extracted */
 MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits)
 {
-#if defined(__BMI__) && defined(__GNUC__)   /* experimental; fails if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8 */
+    /* arbitrate between double-shift and shift+mask */
+#if 1
+    /* if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8,
+     * bitstream is likely corrupted, and result is undefined */
     return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits);
 #else
+    /* this code path is slower on my os-x laptop */
     U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
     return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask);
 #endif
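
A standalone sketch of the shift+mask extraction this hunk adopts; it is not library code (the BIT_mask[] table is replaced by a computed mask), but follows the same idea: masking `start` with regMask keeps the shift within the register width, so a corrupted bitstream produces a garbage value rather than undefined behavior.

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    /* extract nbBits bits of bitContainer, starting at bit `start` (counted from the LSB) */
    static size_t getMiddleBits(size_t bitContainer, unsigned start, unsigned nbBits)
    {
        unsigned const regMask = (unsigned)(sizeof(bitContainer)*8) - 1;
        size_t const mask = ((size_t)1 << nbBits) - 1;   /* stand-in for BIT_mask[nbBits] */
        assert(nbBits < sizeof(bitContainer)*8);
        return (bitContainer >> (start & regMask)) & mask;
    }

    int main(void)
    {
        size_t const v = 0xABCD;
        printf("%x\n", (unsigned)getMiddleBits(v, 4, 8));   /* prints "bc" : bits 4..11 of 0xABCD */
        return 0;
    }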
diff --git a/lib/common/compiler.h b/lib/common/compiler.h
index e90a3bcde36c..07f875e4d38e 100644
--- a/lib/common/compiler.h
+++ b/lib/common/compiler.h
@@ -77,9 +77,9 @@
  * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default.
  */
 #ifndef DYNAMIC_BMI2
-  #if (defined(__clang__) && __has_attribute(__target__)) \
+  #if ((defined(__clang__) && __has_attribute(__target__)) \
       || (defined(__GNUC__) \
-          && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))) \
+          && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \
       && (defined(__x86_64__) || defined(_M_X86)) \
       && !defined(__BMI2__)
   #  define DYNAMIC_BMI2 1
@@ -88,15 +88,37 @@
   #endif
 #endif
 
-/* prefetch */
-#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
-#  include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
-#  define PREFETCH(ptr)   _mm_prefetch((const char*)ptr, _MM_HINT_T0)
-#elif defined(__GNUC__)
-#  define PREFETCH(ptr)   __builtin_prefetch(ptr, 0, 0)
+/* prefetch
+ * can be disabled by defining the NO_PREFETCH macro.
+ * All prefetch invocations use a single default locality 2,
+ * generating the prefetcht1 instruction,
+ * which, according to Intel, means "load data into L2 cache".
+ * This is a good enough "middle ground" for the time being,
+ * though in theory, it would be better to specialize locality depending on the data being prefetched.
+ * Tests could not measure any noticeable difference based on locality value. */
+#if defined(NO_PREFETCH)
+#  define PREFETCH(ptr)     (void)(ptr)  /* disabled */
 #else
-#  define PREFETCH(ptr)   /* disabled */
-#endif
+#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define PREFETCH(ptr)   _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define PREFETCH(ptr)   __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
+#  else
+#    define PREFETCH(ptr)   (void)(ptr)  /* disabled */
+#  endif
+#endif  /* NO_PREFETCH */
+
+#define CACHELINE_SIZE 64
+
+#define PREFETCH_AREA(p, s)  {            \
+    const char* const _ptr = (const char*)(p);  \
+    size_t const _size = (size_t)(s);     \
+    size_t _pos;                          \
+    for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) {  \
+        PREFETCH(_ptr + _pos);            \
+    }                                     \
+}
 
 /* disable warnings */
 #ifdef _MSC_VER    /* Visual Studio */
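
For context, a self-contained sketch of how a cacheline-stride prefetch loop such as PREFETCH_AREA() gets used: issue one prefetch per cacheline over a region, then run the real access pattern. The helper below calls __builtin_prefetch directly so it builds with GCC/Clang without this header; names are illustrative.

    #include <stddef.h>
    #include <stdint.h>

    #define EX_CACHELINE_SIZE 64

    /* one prefetch per cacheline over [p, p+s), locality 2 ("load into L2") */
    static void prefetch_area(const void* p, size_t s)
    {
        const char* const ptr = (const char*)p;
        size_t pos;
        for (pos = 0; pos < s; pos += EX_CACHELINE_SIZE) {
    #if defined(__GNUC__)
            __builtin_prefetch(ptr + pos, 0 /* read */, 2 /* locality */);
    #else
            (void)(ptr + pos);   /* prefetch disabled on other compilers */
    #endif
        }
    }

    /* example : warm up a dictionary before a pass of scattered reads over it */
    static uint32_t sum_bytes(const uint8_t* dict, size_t dictSize)
    {
        uint32_t total = 0;
        size_t i;
        prefetch_area(dict, dictSize);
        for (i = 0; i < dictSize; i++) total += dict[i];
        return total;
    }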
diff --git a/lib/common/cpu.h b/lib/common/cpu.h
index 4eb48e39e10e..eeb428ad5f6a 100644
--- a/lib/common/cpu.h
+++ b/lib/common/cpu.h
@@ -36,7 +36,7 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
     U32 f1d = 0;
     U32 f7b = 0;
     U32 f7c = 0;
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
     int reg[4];
     __cpuid((int*)reg, 0);
     {
@@ -72,8 +72,7 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
           "cpuid\n\t"
           "popl %%ebx\n\t"
           : "=a"(f1a), "=c"(f1c), "=d"(f1d)
-          : "a"(1)
-          :);
+          : "a"(1));
     }
     if (n >= 7) {
       __asm__(
diff --git a/lib/common/debug.c b/lib/common/debug.c
new file mode 100644
index 000000000000..3ebdd1cb15a6
--- /dev/null
+++ b/lib/common/debug.c
@@ -0,0 +1,44 @@
+/* ******************************************************************
+   debug
+   Part of FSE library
+   Copyright (C) 2013-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+
+
+/*
+ * This module only hosts one global variable
+ * which can be used to dynamically influence the verbosity of traces,
+ * such as DEBUGLOG and RAWLOG
+ */
+
+#include "debug.h"
+
+int g_debuglevel = DEBUGLEVEL;
diff --git a/lib/common/debug.h b/lib/common/debug.h
new file mode 100644
index 000000000000..0c04ad2cc98c
--- /dev/null
+++ b/lib/common/debug.h
@@ -0,0 +1,123 @@
+/* ******************************************************************
+   debug
+   Part of FSE library
+   Copyright (C) 2013-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+
+
+/*
+ * The purpose of this header is to enable debug functions.
+ * They regroup assert(), DEBUGLOG() and RAWLOG() for run-time,
+ * and DEBUG_STATIC_ASSERT() for compile-time.
+ *
+ * By default, DEBUGLEVEL==0, which means run-time debug is disabled.
+ *
+ * Level 1 enables assert() only.
+ * Starting level 2, traces can be generated and pushed to stderr.
+ * The higher the level, the more verbose the traces.
+ *
+ * It's possible to dynamically adjust the level using the variable g_debuglevel,
+ * which is only declared if DEBUGLEVEL>=2.
+ * It is a global variable, not protected for multi-threaded use (use with care)
+ */
+
+#ifndef DEBUG_H_12987983217
+#define DEBUG_H_12987983217
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* static assert is triggered at compile time, leaving no runtime artefact,
+ * but can only work with compile-time constants.
+ * This variant can only be used inside a function. */
+#define DEBUG_STATIC_ASSERT(c) (void)sizeof(char[(c) ? 1 : -1])
+
+
+/* DEBUGLEVEL is expected to be defined externally,
+ * typically through compiler command line.
+ * Value must be a number. */
+#ifndef DEBUGLEVEL
+#  define DEBUGLEVEL 0
+#endif
+
+/* recommended values for DEBUGLEVEL :
+ * 0 : no debug, all run-time functions disabled
+ * 1 : no display, enables assert() only
+ * 2 : reserved, for currently active debug path
+ * 3 : events once per object lifetime (CCtx, CDict, etc.)
+ * 4 : events once per frame
+ * 5 : events once per block
+ * 6 : events once per sequence (verbose)
+ * 7+: events at every position (*very* verbose)
+ *
+ * It's generally inconvenient to output traces > 5.
+ * In such cases, it's possible to selectively enable higher verbosity levels
+ * by modifying g_debuglevel.
+ */
+
+#if (DEBUGLEVEL>=1)
+#  include <assert.h>
+#else
+#  ifndef assert   /* assert may be already defined, due to prior #include <assert.h> */
+#    define assert(condition) ((void)0)   /* disable assert (default) */
+#  endif
+#endif
+
+#if (DEBUGLEVEL>=2)
+#  include <stdio.h>
+extern int g_debuglevel; /* here, this variable is only declared,
+                           it actually lives in debug.c,
+                           and is shared by the whole process.
+                           It's typically used to enable very verbose levels
+                           on selective conditions (such as position in src) */
+
+#  define RAWLOG(l, ...) {                                      \
+                if (l<=g_debuglevel) {                          \
+                    fprintf(stderr, __VA_ARGS__);               \
+            }   }
+#  define DEBUGLOG(l, ...) {                                    \
+                if (l<=g_debuglevel) {                          \
+                    fprintf(stderr, __FILE__ ": " __VA_ARGS__); \
+                    fprintf(stderr, " \n");                     \
+            }   }
+#else
+#  define RAWLOG(l, ...)      {}    /* disabled */
+#  define DEBUGLOG(l, ...)    {}    /* disabled */
+#endif
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* DEBUG_H_12987983217 */
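
As a usage sketch (a hypothetical translation unit compiled inside lib/common with `-DDEBUGLEVEL=2`; the function and the trigger condition are invented, only the debug.h names are real):

    #include "debug.h"   /* assert, DEBUGLOG, RAWLOG, g_debuglevel */

    static size_t processBlock(const void* src, size_t srcSize, unsigned blockNb)
    {
        DEBUG_STATIC_ASSERT(sizeof(size_t) >= 4);   /* compile-time, no runtime artefact */
        assert(src != NULL);                        /* active because DEBUGLEVEL >= 1 */

        DEBUGLOG(5, "block %u : %u bytes", blockNb, (unsigned)srcSize);

        /* selectively raise verbosity around a suspicious block
         * (g_debuglevel is global and not multi-thread protected) */
        if (blockNb == 42) g_debuglevel = 7;

        RAWLOG(6, "first byte = %02X \n", ((const unsigned char*)src)[0]);
        return srcSize;
    }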
diff --git a/lib/common/entropy_common.c b/lib/common/entropy_common.c
index b37a082fee2c..b12944e1de93 100644
--- a/lib/common/entropy_common.c
+++ b/lib/common/entropy_common.c
@@ -72,7 +72,21 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t
     unsigned charnum = 0;
     int previous0 = 0;
 
-    if (hbSize < 4) return ERROR(srcSize_wrong);
+    if (hbSize < 4) {
+        /* This function only works when hbSize >= 4 */
+        char buffer[4];
+        memset(buffer, 0, sizeof(buffer));
+        memcpy(buffer, headerBuffer, hbSize);
+        {   size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr,
+                                                    buffer, sizeof(buffer));
+            if (FSE_isError(countSize)) return countSize;
+            if (countSize > hbSize) return ERROR(corruption_detected);
+            return countSize;
+    }   }
+    assert(hbSize >= 4);
+
+    /* init */
+    memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0]));   /* all symbols not present in NCount have a frequency of 0 */
     bitStream = MEM_readLE32(ip);
     nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
     if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
@@ -105,6 +119,7 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t
             if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
             while (charnum < n0) normalizedCounter[charnum++] = 0;
             if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                assert((bitCount >> 3) <= 3); /* For first condition to work */
                 ip += bitCount>>3;
                 bitCount &= 7;
                 bitStream = MEM_readLE32(ip) >> bitCount;
diff --git a/lib/common/fse.h b/lib/common/fse.h
index 6a1d272be5cb..a5a6b6d4db70 100644
--- a/lib/common/fse.h
+++ b/lib/common/fse.h
@@ -72,6 +72,7 @@ extern "C" {
 #define FSE_VERSION_NUMBER  (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE)
 FSE_PUBLIC_API unsigned FSE_versionNumber(void);   /**< library version number; to be used when checking dll version */
 
+
 /*-****************************************
 *  FSE simple functions
 ******************************************/
@@ -129,7 +130,7 @@ FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src,
 ******************************************/
 /*!
 FSE_compress() does the following:
-1. count symbol occurrence from source[] into table count[]
+1. count symbol occurrences from source[] into table count[] (see hist.h)
 2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
 3. save normalized counters to memory buffer using writeNCount()
 4. build encoding table 'CTable' from normalized counters
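
To make these steps concrete, a hedged sketch of the detailed pipeline using declarations from this header (FSE_optimalTableLog, FSE_normalizeCount, FSE_writeNCount, FSE_createCTable / FSE_buildCTable / FSE_freeCTable, FSE_compress_usingCTable). The histogram is computed by hand where the library itself now relies on hist.h, error handling is reduced to one check per step, and FSE_createCTable is assumed to take (maxSymbolValue, tableLog).

    #include "fse.h"

    /* steps 1-4 of FSE_compress(), spelled out ;
     * returns NCount header size + bitstream size, or an error code */
    static size_t fse_compress_detailed(void* dst, size_t dstCapacity,
                                        const void* src, size_t srcSize)
    {
        const unsigned char* const ip = (const unsigned char*)src;
        unsigned count[256] = { 0 };
        short norm[256];
        unsigned maxSymbolValue = 255;
        unsigned tableLog;
        size_t n, hSize, cSize;
        FSE_CTable* ct;

        /* 1. count symbol occurrences (the library uses HIST_count() from hist.h) */
        for (n = 0; n < srcSize; n++) count[ip[n]]++;
        while (maxSymbolValue && count[maxSymbolValue] == 0) maxSymbolValue--;

        /* 2. normalize counters so that sum(norm[]) == 2^tableLog */
        tableLog = FSE_optimalTableLog(0 /* default */, srcSize, maxSymbolValue);
        {   size_t const r = FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue);
            if (FSE_isError(r)) return r;
        }

        /* 3. save normalized counters into the destination buffer */
        hSize = FSE_writeNCount(dst, dstCapacity, norm, maxSymbolValue, tableLog);
        if (FSE_isError(hSize)) return hSize;

        /* 4. build encoding table 'CTable', then encode with it */
        ct = FSE_createCTable(maxSymbolValue, tableLog);
        if (ct == NULL) return (size_t)-1;   /* allocation failure (simplified signalling) */
        {   size_t const r = FSE_buildCTable(ct, norm, maxSymbolValue, tableLog);
            if (FSE_isError(r)) { FSE_freeCTable(ct); return r; }
        }
        cSize = FSE_compress_usingCTable((char*)dst + hSize, dstCapacity - hSize,
                                         src, srcSize, ct);
        FSE_freeCTable(ct);
        if (FSE_isError(cSize)) return cSize;
        return hSize + cSize;
    }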
@@ -147,15 +148,6 @@ or to save and provide normalized distribution using external method.
 
 /* *** COMPRESSION *** */
 
-/*! FSE_count():
-    Provides the precise count of each byte within a table 'count'.
-    'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1).
-    *maxSymbolValuePtr will be updated if detected smaller than initial value.
-    @return : the count of the most frequent symbol (which is not identified).
-              if return == srcSize, there is only one symbol.
-              Can also return an error code, which can be tested with FSE_isError(). */
-FSE_PUBLIC_API size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
-
 /*! FSE_optimalTableLog():
     dynamically downsize 'tableLog' when conditions are met.
     It saves CPU time, by using smaller tables, while preserving or even improving compression ratio.
@@ -167,7 +159,8 @@ FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize
     'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1).
     @return : tableLog,
               or an errorCode, which can be tested using FSE_isError() */
-FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, const unsigned* count, size_t srcSize, unsigned maxSymbolValue);
+FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog,
+                    const unsigned* count, size_t srcSize, unsigned maxSymbolValue);
 
 /*! FSE_NCountWriteBound():
     Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'.
@@ -178,8 +171,9 @@ FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tab
     Compactly save 'normalizedCounter' into 'buffer'.
     @return : size of the compressed table,
               or an errorCode, which can be tested using FSE_isError(). */
-FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
-
+FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+                                 const short* normalizedCounter,
+                                 unsigned maxSymbolValue, unsigned tableLog);
 
 /*! Constructor and Destructor of FSE_CTable.
     Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
@@ -250,7 +244,9 @@ If there is an error, the function will return an ErrorCode (which can be tested
     @return : size read from 'rBuffer',
               or an errorCode, which can be tested using FSE_isError().
               maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
-FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize);
+FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter,
+                           unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
+                           const void* rBuffer, size_t rBuffSize);
 
 /*! Constructor and Destructor of FSE_DTable.
     Note that its size depends on 'tableLog' */
@@ -325,33 +321,8 @@ If there is an error, the function will return an error code, which can be teste
 
 
 /* *****************************************
-*  FSE advanced API
-*******************************************/
-/* FSE_count_wksp() :
- * Same as FSE_count(), but using an externally provided scratch buffer.
- * `workSpace` size must be table of >= `1024` unsigned
- */
-size_t FSE_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
-                 const void* source, size_t sourceSize, unsigned* workSpace);
-
-/** FSE_countFast() :
- *  same as FSE_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr
- */
-size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
-
-/* FSE_countFast_wksp() :
- * Same as FSE_countFast(), but using an externally provided scratch buffer.
- * `workSpace` must be a table of minimum `1024` unsigned
- */
-size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* workSpace);
-
-/*! FSE_count_simple() :
- * Same as FSE_countFast(), but does not use any additional memory (not even on stack).
- * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr` (presuming it's also the size of `count`).
-*/
-size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
-
-
+ *  FSE advanced API
+ ***************************************** */
 
 unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
 /**< same as FSE_optimalTableLog(), which used `minus==2` */
@@ -576,6 +547,39 @@ MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePt
 }
 
 
+/* FSE_getMaxNbBits() :
+ * Approximate maximum cost of a symbol, in bits.
+ * Fractional values are rounded up (for example, a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
+ * note 1 : assume symbolValue is valid (<= maxSymbolValue)
+ * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
+MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue)
+{
+    const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr;
+    return (symbolTT[symbolValue].deltaNbBits + ((1<<16)-1)) >> 16;
+}
+
+/* FSE_bitCost() :
+ * Approximate symbol cost, as fractional value, using fixed-point format (accuracyLog fractional bits)
+ * note 1 : assume symbolValue is valid (<= maxSymbolValue)
+ * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
+MEM_STATIC U32 FSE_bitCost(const void* symbolTTPtr, U32 tableLog, U32 symbolValue, U32 accuracyLog)
+{
+    const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr;
+    U32 const minNbBits = symbolTT[symbolValue].deltaNbBits >> 16;
+    U32 const threshold = (minNbBits+1) << 16;
+    assert(tableLog < 16);
+    assert(accuracyLog < 31-tableLog);  /* ensure enough room for renormalization double shift */
+    {   U32 const tableSize = 1 << tableLog;
+        U32 const deltaFromThreshold = threshold - (symbolTT[symbolValue].deltaNbBits + tableSize);
+        U32 const normalizedDeltaFromThreshold = (deltaFromThreshold << accuracyLog) >> tableLog;   /* linear interpolation (very approximate) */
+        U32 const bitMultiplier = 1 << accuracyLog;
+        assert(symbolTT[symbolValue].deltaNbBits + tableSize <= threshold);
+        assert(normalizedDeltaFromThreshold <= bitMultiplier);
+        return (minNbBits+1)*bitMultiplier - normalizedDeltaFromThreshold;
+    }
+}
+
+
 /* ======    Decompression    ====== */
 
 typedef struct {
diff --git a/lib/common/fse_decompress.c b/lib/common/fse_decompress.c
index 4c66c3b77464..72bbead5beea 100644
--- a/lib/common/fse_decompress.c
+++ b/lib/common/fse_decompress.c
@@ -49,7 +49,7 @@
 *  Error Management
 ****************************************************************/
 #define FSE_isError ERR_isError
-#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+#define FSE_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)   /* use only *after* variable declarations */
 
 /* check and forward error code */
 #define CHECK_F(f) { size_t const e = f; if (FSE_isError(e)) return e; }
diff --git a/lib/common/huf.h b/lib/common/huf.h
index b4645b4e5197..de9464111064 100644
--- a/lib/common/huf.h
+++ b/lib/common/huf.h
@@ -1,7 +1,7 @@
 /* ******************************************************************
-   Huffman coder, part of New Generation Entropy library
-   header file
-   Copyright (C) 2013-2016, Yann Collet.
+   huff0 huffman codec,
+   part of Finite State Entropy library
+   Copyright (C) 2013-present, Yann Collet.
 
    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -163,25 +163,25 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
 /* static allocation of HUF's DTable */
 typedef U32 HUF_DTable;
 #define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<(maxTableLog)))
-#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+#define HUF_CREATE_STATIC_DTABLEX1(DTable, maxTableLog) \
         HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1) * 0x01000001) }
-#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
         HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) }
 
 
 /* ****************************************
 *  Advanced decompression functions
 ******************************************/
-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
-size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
 
 size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< decodes RLE and uncompressed */
 size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */
 size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< considers RLE and uncompressed as errors */
-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< single-symbol decoder */
-size_t HUF_decompress4X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
-size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< double-symbols decoder */
+size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< double-symbols decoder */
 
 
 /* ****************************************
@@ -208,7 +208,7 @@ size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, si
 typedef enum {
    HUF_repeat_none,  /**< Cannot use the previous table */
    HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
-   HUF_repeat_valid  /**< Can use the previous table and it is asumed to be valid */
+   HUF_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
  } HUF_repeat;
 /** HUF_compress4X_repeat() :
  *  Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
@@ -227,7 +227,9 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
  */
 #define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1)
 #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
-size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize);
+size_t HUF_buildCTable_wksp (HUF_CElt* tree,
+                       const U32* count, U32 maxSymbolValue, U32 maxNbBits,
+                             void* workSpace, size_t wkspSize);
 
 /*! HUF_readStats() :
  *  Read compact Huffman tree, saved by HUF_writeCTable().
@@ -242,10 +244,15 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize,
  *  Loading a CTable saved with HUF_writeCTable() */
 size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
 
+/** HUF_getNbBits() :
+ *  Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX
+ *  Note 1 : is not inlined, as HUF_CElt definition is private
+ *  Note 2 : const void* used, so that a statically allocated table (which uses type U32) can be passed as argument */
+U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue);
 
 /*
  * HUF_decompress() does the following:
- * 1. select the decompression algorithm (X2, X4) based on pre-computed heuristics
+ * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics
  * 2. build Huffman table from save, using HUF_readDTableX?()
  * 3. decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable()
  */
@@ -253,13 +260,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
 /** HUF_selectDecoder() :
  *  Tells which decoder is likely to decode faster,
  *  based on a set of pre-computed metrics.
- * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+ * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
  *  Assumption : 0 < dstSize <= 128 KB */
 U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
 
 /**
  *  The minimum workspace size for the `workSpace` used in
- *  HUF_readDTableX2_wksp() and HUF_readDTableX4_wksp().
+ *  HUF_readDTableX1_wksp() and HUF_readDTableX2_wksp().
  *
  *  The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when
  *  HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15.
@@ -270,14 +277,14 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
 #define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10)
 #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
 
+size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize);
+size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
 size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize);
 size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
-size_t HUF_readDTableX4 (HUF_DTable* DTable, const void* src, size_t srcSize);
-size_t HUF_readDTableX4_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
 
 size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
 size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
 
 
 /* ====================== */
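
A hypothetical sketch of how these declarations fit together (assuming the static-linking section of this header for HUF_CREATE_STATIC_DTABLEX2, HUF_TABLELOG_MAX and the workspace macros; the wrapper name is invented): ask HUF_selectDecoder() which decoder is expected to be faster, then run the matching 4-stream wksp variant.

    #define HUF_STATIC_LINKING_ONLY   /* assumed: advanced declarations sit behind this guard */
    #include "huf.h"

    static size_t decodeHufBlock(void* dst, size_t dstSize,
                                 const void* cSrc, size_t cSrcSize)
    {
        HUF_CREATE_STATIC_DTABLEX2(dtable, HUF_TABLELOG_MAX);   /* large enough for either decoder */
        U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
        U32 const algo = HUF_selectDecoder(dstSize, cSrcSize);  /* 0 => X1, 1 => X2 */

        return algo ?
            HUF_decompress4X2_DCtx_wksp(dtable, dst, dstSize, cSrc, cSrcSize,
                                        workSpace, sizeof(workSpace)) :
            HUF_decompress4X1_DCtx_wksp(dtable, dst, dstSize, cSrc, cSrcSize,
                                        workSpace, sizeof(workSpace));
    }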
@@ -298,25 +305,25 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
                        void* workSpace, size_t wkspSize,   /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
                        HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);
 
-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
-size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
+size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
 
 size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
 size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);
-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< single-symbol decoder */
-size_t HUF_decompress1X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
-size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< double-symbols decoder */
+size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< single-symbol decoder */
+size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< double-symbols decoder */
 
 size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /**< automatic selection of single or double symbol decoder, based on DTable */
+size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
 size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-size_t HUF_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
 
 /* BMI2 variants.
  * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
  */
 size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
-size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
 size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
 size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
 
diff --git a/lib/common/mem.h b/lib/common/mem.h
index 47d2300177c0..5da248756ffd 100644
--- a/lib/common/mem.h
+++ b/lib/common/mem.h
@@ -39,6 +39,10 @@ extern "C" {
 #  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
 #endif
 
+#ifndef __has_builtin
+#  define __has_builtin(x) 0  /* compat. with non-clang compilers */
+#endif
+
 /* code only tested on 32 and 64 bits systems */
 #define MEM_STATIC_ASSERT(c)   { enum { MEM_static_assert = 1/(int)(!!(c)) }; }
 MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); }
@@ -57,11 +61,23 @@ MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (size
   typedef  uint64_t U64;
   typedef   int64_t S64;
 #else
+# include <limits.h>
+#if CHAR_BIT != 8
+#  error "this implementation requires char to be exactly 8-bit type"
+#endif
   typedef unsigned char      BYTE;
+#if USHRT_MAX != 65535
+#  error "this implementation requires short to be exactly 16-bit type"
+#endif
   typedef unsigned short      U16;
   typedef   signed short      S16;
+#if UINT_MAX != 4294967295
+#  error "this implementation requires int to be exactly 32-bit type"
+#endif
   typedef unsigned int        U32;
   typedef   signed int        S32;
+/* note : there are no limits defined for long long type in C90.
+ * limits exist in C99, however, in such cases, <stdint.h> is preferred */
   typedef unsigned long long  U64;
   typedef   signed long long  S64;
 #endif
@@ -186,7 +202,8 @@ MEM_STATIC U32 MEM_swap32(U32 in)
 {
 #if defined(_MSC_VER)     /* Visual Studio */
     return _byteswap_ulong(in);
-#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
+#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \
+  || (defined(__clang__) && __has_builtin(__builtin_bswap32))
     return __builtin_bswap32(in);
 #else
     return  ((in << 24) & 0xff000000 ) |
@@ -200,7 +217,8 @@ MEM_STATIC U64 MEM_swap64(U64 in)
 {
 #if defined(_MSC_VER)     /* Visual Studio */
     return _byteswap_uint64(in);
-#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
+#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \
+  || (defined(__clang__) && __has_builtin(__builtin_bswap64))
     return __builtin_bswap64(in);
 #else
     return  ((in << 56) & 0xff00000000000000ULL) |
diff --git a/lib/common/pool.c b/lib/common/pool.c
index 773488b07255..281b3824ac4d 100644
--- a/lib/common/pool.c
+++ b/lib/common/pool.c
@@ -10,9 +10,10 @@
 
 
 /* ======   Dependencies   ======= */
-#include <stddef.h>  /* size_t */
-#include "pool.h"
+#include <stddef.h>    /* size_t */
+#include "debug.h"     /* assert */
 #include "zstd_internal.h"  /* ZSTD_malloc, ZSTD_free */
+#include "pool.h"
 
 /* ======   Compiler specifics   ====== */
 #if defined(_MSC_VER)
@@ -33,8 +34,9 @@ typedef struct POOL_job_s {
 struct POOL_ctx_s {
     ZSTD_customMem customMem;
     /* Keep track of the threads */
-    ZSTD_pthread_t *threads;
-    size_t numThreads;
+    ZSTD_pthread_t* threads;
+    size_t threadCapacity;
+    size_t threadLimit;
 
     /* The queue is a circular buffer */
     POOL_job *queue;
@@ -58,10 +60,10 @@ struct POOL_ctx_s {
 };
 
 /* POOL_thread() :
-   Work thread for the thread pool.
-   Waits for jobs and executes them.
-   @returns : NULL on failure else non-null.
-*/
+ * Work thread for the thread pool.
+ * Waits for jobs and executes them.
+ * @returns : NULL on failure else non-null.
+ */
 static void* POOL_thread(void* opaque) {
     POOL_ctx* const ctx = (POOL_ctx*)opaque;
     if (!ctx) { return NULL; }
@@ -69,14 +71,17 @@ static void* POOL_thread(void* opaque) {
         /* Lock the mutex and wait for a non-empty queue or until shutdown */
         ZSTD_pthread_mutex_lock(&ctx->queueMutex);
 
-        while (ctx->queueEmpty && !ctx->shutdown) {
+        while ( ctx->queueEmpty
+            || (ctx->numThreadsBusy >= ctx->threadLimit) ) {
+            if (ctx->shutdown) {
+                /* even if !queueEmpty (possible when numThreadsBusy >= threadLimit),
+                 * a few threads will be shut down while the queue is not yet empty,
+                 * but enough threads will remain active to finish the queue */
+                ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+                return opaque;
+            }
             ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex);
         }
-        /* empty => shutting down: so stop */
-        if (ctx->queueEmpty) {
-            ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
-            return opaque;
-        }
         /* Pop a job off the queue */
         {   POOL_job const job = ctx->queue[ctx->queueHead];
             ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize;
@@ -89,30 +94,32 @@ static void* POOL_thread(void* opaque) {
             job.function(job.opaque);
 
             /* If the intended queue size was 0, signal after finishing job */
+            ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+            ctx->numThreadsBusy--;
             if (ctx->queueSize == 1) {
-                ZSTD_pthread_mutex_lock(&ctx->queueMutex);
-                ctx->numThreadsBusy--;
-                ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
                 ZSTD_pthread_cond_signal(&ctx->queuePushCond);
-        }   }
+            }
+            ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+        }
     }  /* for (;;) */
-    /* Unreachable */
+    assert(0);  /* Unreachable */
 }
 
 POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {
     return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
 }
 
-POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) {
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
+                               ZSTD_customMem customMem) {
     POOL_ctx* ctx;
-    /* Check the parameters */
+    /* Check parameters */
     if (!numThreads) { return NULL; }
     /* Allocate the context and zero initialize */
     ctx = (POOL_ctx*)ZSTD_calloc(sizeof(POOL_ctx), customMem);
     if (!ctx) { return NULL; }
     /* Initialize the job queue.
-     * It needs one extra space since one space is wasted to differentiate empty
-     * and full queues.
+     * It needs one extra space since one space is wasted to differentiate
+     * empty and full queues.
      */
     ctx->queueSize = queueSize + 1;
     ctx->queue = (POOL_job*)ZSTD_malloc(ctx->queueSize * sizeof(POOL_job), customMem);
@@ -126,7 +133,7 @@ POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customM
     ctx->shutdown = 0;
     /* Allocate space for the thread handles */
     ctx->threads = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), customMem);
-    ctx->numThreads = 0;
+    ctx->threadCapacity = 0;
     ctx->customMem = customMem;
     /* Check for errors */
     if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; }
@@ -134,11 +141,12 @@ POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customM
     {   size_t i;
         for (i = 0; i < numThreads; ++i) {
             if (ZSTD_pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) {
-                ctx->numThreads = i;
+                ctx->threadCapacity = i;
                 POOL_free(ctx);
                 return NULL;
         }   }
-        ctx->numThreads = numThreads;
+        ctx->threadCapacity = numThreads;
+        ctx->threadLimit = numThreads;
     }
     return ctx;
 }
@@ -156,8 +164,8 @@ static void POOL_join(POOL_ctx* ctx) {
     ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);
     /* Join all of the threads */
     {   size_t i;
-        for (i = 0; i < ctx->numThreads; ++i) {
-            ZSTD_pthread_join(ctx->threads[i], NULL);
+        for (i = 0; i < ctx->threadCapacity; ++i) {
+            ZSTD_pthread_join(ctx->threads[i], NULL);  /* note : could fail */
     }   }
 }
 
@@ -172,24 +180,68 @@ void POOL_free(POOL_ctx *ctx) {
     ZSTD_free(ctx, ctx->customMem);
 }
 
+
+
 size_t POOL_sizeof(POOL_ctx *ctx) {
     if (ctx==NULL) return 0;  /* supports sizeof NULL */
     return sizeof(*ctx)
         + ctx->queueSize * sizeof(POOL_job)
-        + ctx->numThreads * sizeof(ZSTD_pthread_t);
+        + ctx->threadCapacity * sizeof(ZSTD_pthread_t);
+}
+
+
+/* @return : 0 on success, 1 on error */
+static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads)
+{
+    if (numThreads <= ctx->threadCapacity) {
+        if (!numThreads) return 1;
+        ctx->threadLimit = numThreads;
+        return 0;
+    }
+    /* numThreads > threadCapacity */
+    {   ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem);
+        if (!threadPool) return 1;
+        /* replace existing thread pool */
+        memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(*threadPool));
+        ZSTD_free(ctx->threads, ctx->customMem);
+        ctx->threads = threadPool;
+        /* Initialize additional threads */
+        {   size_t threadId;
+            for (threadId = ctx->threadCapacity; threadId < numThreads; ++threadId) {
+                if (ZSTD_pthread_create(&threadPool[threadId], NULL, &POOL_thread, ctx)) {
+                    ctx->threadCapacity = threadId;
+                    return 1;
+            }   }
+    }   }
+    /* successfully expanded */
+    ctx->threadCapacity = numThreads;
+    ctx->threadLimit = numThreads;
+    return 0;
+}
+
+/* @return : 0 on success, 1 on error */
+int POOL_resize(POOL_ctx* ctx, size_t numThreads)
+{
+    int result;
+    if (ctx==NULL) return 1;
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    result = POOL_resize_internal(ctx, numThreads);
+    ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    return result;
 }
 
 /**
  * Returns 1 if the queue is full and 0 otherwise.
  *
- * If the queueSize is 1 (the pool was created with an intended queueSize of 0),
- * then a queue is empty if there is a thread free and no job is waiting.
+ * When queueSize is 1 (pool was created with an intended queueSize of 0),
+ * then a queue is empty if there is a thread free _and_ no job is waiting.
  */
 static int isQueueFull(POOL_ctx const* ctx) {
     if (ctx->queueSize > 1) {
         return ctx->queueHead == ((ctx->queueTail + 1) % ctx->queueSize);
     } else {
-        return ctx->numThreadsBusy == ctx->numThreads ||
+        return (ctx->numThreadsBusy == ctx->threadLimit) ||
                !ctx->queueEmpty;
     }
 }
@@ -263,6 +315,11 @@ void POOL_free(POOL_ctx* ctx) {
     (void)ctx;
 }
 
+int POOL_resize(POOL_ctx* ctx, size_t numThreads) {
+    (void)ctx; (void)numThreads;
+    return 0;
+}
+
 void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) {
     (void)ctx;
     function(opaque);
diff --git a/lib/common/pool.h b/lib/common/pool.h
index a57e9b4fabc2..458d37f13c3e 100644
--- a/lib/common/pool.h
+++ b/lib/common/pool.h
@@ -30,40 +30,50 @@ typedef struct POOL_ctx_s POOL_ctx;
 */
 POOL_ctx* POOL_create(size_t numThreads, size_t queueSize);
 
-POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem);
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
+                               ZSTD_customMem customMem);
 
 /*! POOL_free() :
-    Free a thread pool returned by POOL_create().
-*/
+ *  Free a thread pool returned by POOL_create().
+ */
 void POOL_free(POOL_ctx* ctx);
 
+/*! POOL_resize() :
+ *  Expands or shrinks pool's number of threads.
+ *  This is more efficient than releasing + creating a new context,
+ *  since it tries to preserve and re-use existing threads.
+ * `numThreads` must be at least 1.
+ * @return : 0 when resize was successful,
+ *           !0 (typically 1) if there is an error.
+ *    note : only numThreads can be resized, queueSize remains unchanged.
+ */
+int POOL_resize(POOL_ctx* ctx, size_t numThreads);
+
 /*! POOL_sizeof() :
-    return memory usage of pool returned by POOL_create().
-*/
+ * @return threadpool memory usage
+ *  note : compatible with NULL (returns 0 in this case)
+ */
 size_t POOL_sizeof(POOL_ctx* ctx);
 
 /*! POOL_function :
-    The function type that can be added to a thread pool.
-*/
+ *  The function type that can be added to a thread pool.
+ */
 typedef void (*POOL_function)(void*);
-/*! POOL_add_function :
-    The function type for a generic thread pool add function.
-*/
-typedef void (*POOL_add_function)(void*, POOL_function, void*);
 
 /*! POOL_add() :
-    Add the job `function(opaque)` to the thread pool. `ctx` must be valid.
-    Possibly blocks until there is room in the queue.
-    Note : The function may be executed asynchronously, so `opaque` must live until the function has been completed.
-*/
+ *  Add the job `function(opaque)` to the thread pool. `ctx` must be valid.
+ *  Possibly blocks until there is room in the queue.
+ *  Note : The function may be executed asynchronously,
+ *         therefore, `opaque` must live until function has been completed.
+ */
 void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque);
 
 
 /*! POOL_tryAdd() :
-    Add the job `function(opaque)` to the thread pool if a worker is available.
-    return immediately otherwise.
-   @return : 1 if successful, 0 if not.
-*/
+ *  Add the job `function(opaque)` to thread pool _if_ a worker is available.
+ *  Returns immediately even if not (does not block).
+ * @return : 1 if successful, 0 if not.
+ */
 int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque);
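
A hypothetical usage sketch of these internal helpers (assumed to be compiled within lib/common; the job payloads and thread counts are invented): create a pool, queue jobs, then grow it with the new POOL_resize() once more parallelism is wanted.

    #include <stdio.h>
    #include "pool.h"   /* POOL_create, POOL_add, POOL_resize, POOL_free */

    static void printJob(void* opaque)
    {
        /* `opaque` must stay valid until the job completes (see POOL_add note) */
        int const jobID = *(const int*)opaque;
        printf("job %d done\n", jobID);
    }

    int main(void)
    {
        int jobIDs[8];
        int i;
        POOL_ctx* const pool = POOL_create(2 /* threads */, 4 /* queue size */);
        if (pool == NULL) return 1;

        for (i = 0; i < 4; i++) {
            jobIDs[i] = i;
            POOL_add(pool, printJob, &jobIDs[i]);   /* may block when the queue is full */
        }

        /* more cores became available : grow the pool, re-using existing threads */
        if (POOL_resize(pool, 4 /* threads */)) { POOL_free(pool); return 1; }

        for (i = 4; i < 8; i++) {
            jobIDs[i] = i;
            POOL_add(pool, printJob, &jobIDs[i]);
        }

        POOL_free(pool);   /* joins the threads ; queued jobs are finished first */
        return 0;
    }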
 
 
diff --git a/lib/common/xxhash.c b/lib/common/xxhash.c
index 9d9c0e963cbf..532b8161929d 100644
--- a/lib/common/xxhash.c
+++ b/lib/common/xxhash.c
@@ -98,6 +98,7 @@
 /* Modify the local functions below should you wish to use some other memory routines */
 /* for malloc(), free() */
 #include <stdlib.h>
+#include <stddef.h>      /* size_t */
 static void* XXH_malloc(size_t s) { return malloc(s); }
 static void  XXH_free  (void* p)  { free(p); }
 /* for memcpy() */
diff --git a/lib/common/zstd_common.c b/lib/common/zstd_common.c
index bccc948892d8..6f05d240e43c 100644
--- a/lib/common/zstd_common.c
+++ b/lib/common/zstd_common.c
@@ -46,11 +46,6 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
  *  provides error code string from enum */
 const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
 
-/*! g_debuglog_enable :
- *  turn on/off debug traces (global switch) */
-#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG >= 2)
-int g_debuglog_enable = 1;
-#endif
 
 
 /*=**************************************************************
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index 65c08a825706..e75adfa61323 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -21,6 +21,7 @@
 ***************************************/
 #include "compiler.h"
 #include "mem.h"
+#include "debug.h"                 /* assert, DEBUGLOG, RAWLOG, g_debuglevel */
 #include "error_private.h"
 #define ZSTD_STATIC_LINKING_ONLY
 #include "zstd.h"
@@ -38,43 +39,8 @@
 extern "C" {
 #endif
 
-
-/*-*************************************
-*  Debug
-***************************************/
-#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
-#  include <assert.h>
-#else
-#  ifndef assert
-#    define assert(condition) ((void)0)
-#  endif
-#endif
-
-#define ZSTD_STATIC_ASSERT(c) { enum { ZSTD_static_assert = 1/(int)(!!(c)) }; }
-
-#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=2)
-#  include <stdio.h>
-extern int g_debuglog_enable;
-/* recommended values for ZSTD_DEBUG display levels :
- * 1 : no display, enables assert() only
- * 2 : reserved for currently active debug path
- * 3 : events once per object lifetime (CCtx, CDict, etc.)
- * 4 : events once per frame
- * 5 : events once per block
- * 6 : events once per sequence (*very* verbose) */
-#  define RAWLOG(l, ...) {                                      \
-                if ((g_debuglog_enable) & (l<=ZSTD_DEBUG)) {    \
-                    fprintf(stderr, __VA_ARGS__);               \
-            }   }
-#  define DEBUGLOG(l, ...) {                                    \
-                if ((g_debuglog_enable) & (l<=ZSTD_DEBUG)) {    \
-                    fprintf(stderr, __FILE__ ": " __VA_ARGS__); \
-                    fprintf(stderr, " \n");                     \
-            }   }
-#else
-#  define RAWLOG(l, ...)      {}    /* disabled */
-#  define DEBUGLOG(l, ...)    {}    /* disabled */
-#endif
+/* ---- static assert (debug) --- */
+#define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)
 
 
 /*-*************************************
@@ -113,8 +79,7 @@ static const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 };
 static const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 };
 static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 };
 
-#define ZSTD_FRAMEIDSIZE 4
-static const size_t ZSTD_frameIdSize = ZSTD_FRAMEIDSIZE;  /* magic number size */
+#define ZSTD_FRAMEIDSIZE 4   /* magic number size */
 
 #define ZSTD_BLOCKHEADERSIZE 3   /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
 static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
@@ -227,6 +192,8 @@ typedef struct {
     BYTE* llCode;
     BYTE* mlCode;
     BYTE* ofCode;
+    size_t maxNbSeq;
+    size_t maxNbLit;
     U32   longLengthID;   /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
     U32   longLengthPos;
 } seqStore_t;
diff --git a/lib/compress/fse_compress.c b/lib/compress/fse_compress.c
index cb8f1fa3233e..4408f0ed5b50 100644
--- a/lib/compress/fse_compress.c
+++ b/lib/compress/fse_compress.c
@@ -1,6 +1,6 @@
 /* ******************************************************************
    FSE : Finite State Entropy encoder
-   Copyright (C) 2013-2015, Yann Collet.
+   Copyright (C) 2013-present, Yann Collet.
 
    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -37,9 +37,11 @@
 ****************************************************************/
 #include <stdlib.h>     /* malloc, free, qsort */
 #include <string.h>     /* memcpy, memset */
-#include <stdio.h>      /* printf (debug) */
-#include "bitstream.h"
 #include "compiler.h"
+#include "mem.h"        /* U32, U16, etc. */
+#include "debug.h"      /* assert, DEBUGLOG */
+#include "hist.h"       /* HIST_count_wksp */
+#include "bitstream.h"
 #define FSE_STATIC_LINKING_ONLY
 #include "fse.h"
 #include "error_private.h"
@@ -49,7 +51,6 @@
 *  Error Management
 ****************************************************************/
 #define FSE_isError ERR_isError
-#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
 
 
 /* **************************************************************
@@ -82,7 +83,9 @@
  * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
     if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize) return ERROR(tableLog_tooLarge);
     tableU16[-2] = (U16) tableLog;
     tableU16[-1] = (U16) maxSymbolValue;
+    assert(tableLog < 16);   /* required for threshold strategy to work */
 
     /* For explanations on how to distribute symbol values over the table :
-    *  http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+     * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+     #ifdef __clang_analyzer__
+     memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize);   /* useless initialization, just to keep scan-build happy */
+     #endif
 
     /* symbol start positions */
     {   U32 u;
@@ -122,13 +130,15 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi
         U32 symbol;
         for (symbol=0; symbol<=maxSymbolValue; symbol++) {
             int nbOccurences;
             for (nbOccurences=0; nbOccurences<normalizedCounter[symbol]; nbOccurences++) {
                 tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
                 position = (position + step) & tableMask;
-                while (position > highThreshold) position = (position + step) & tableMask;   /* Low proba area */
+                while (position > highThreshold)
+                    position = (position + step) & tableMask;   /* Low proba area */
         }   }
 
-        if (position!=0) return ERROR(GENERIC);   /* Must have gone through all positions */
+        assert(position==0);  /* Must have initialized all positions */
     }
 
     /* Build table */
@@ -143,7 +153,10 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi
         for (s=0; s<=maxSymbolValue; s++) {
             switch (normalizedCounter[s])
             {
-            case  0: break;
+            case  0:
+                /* filling nonetheless, for compatibility with FSE_getMaxNbBits() */
+                symbolTT[s].deltaNbBits = ((tableLog+1) << 16) - (1<<tableLog);
+                break;
-    while (remaining>1) {  /* stops at 1 */
-        if (previous0) {
-            unsigned start = charnum;
-            while (!normalizedCounter[charnum]) charnum++;
-            while (charnum >= start+24) {
+    while ((symbol < alphabetSize) && (remaining>1)) {  /* stops at 1 */
+        if (previousIs0) {
+            unsigned start = symbol;
+            while ((symbol < alphabetSize) && !normalizedCounter[symbol]) symbol++;
+            if (symbol == alphabetSize) break;   /* incorrect distribution */
+            while (symbol >= start+24) {
                 start+=24;
                 bitStream += 0xFFFFU << bitCount;
-                if ((!writeIsSafe) && (out > oend-2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                if ((!writeIsSafe) && (out > oend-2))
+                    return ERROR(dstSize_tooSmall);   /* Buffer overflow */
                 out[0] = (BYTE) bitStream;
                 out[1] = (BYTE)(bitStream>>8);
                 out+=2;
                 bitStream>>=16;
             }
-            while (charnum >= start+3) {
+            while (symbol >= start+3) {
                 start+=3;
                 bitStream += 3 << bitCount;
                 bitCount += 2;
             }
-            bitStream += (charnum-start) << bitCount;
+            bitStream += (symbol-start) << bitCount;
             bitCount += 2;
             if (bitCount>16) {
-                if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                if ((!writeIsSafe) && (out > oend - 2))
+                    return ERROR(dstSize_tooSmall);   /* Buffer overflow */
                 out[0] = (BYTE)bitStream;
                 out[1] = (BYTE)(bitStream>>8);
                 out += 2;
                 bitStream >>= 16;
                 bitCount -= 16;
         }   }
-        {   int count = normalizedCounter[charnum++];
-            int const max = (2*threshold-1)-remaining;
+        {   int count = normalizedCounter[symbol++];
+            int const max = (2*threshold-1) - remaining;
             remaining -= count < 0 ? -count : count;
             count++;   /* +1 for extra accuracy */
-            if (count>=threshold) count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+            if (count>=threshold)
+                count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
             bitStream += count << bitCount;
             bitCount  += nbBits;
             bitCount  -= (count<max);
-            previous0  = (count==1);
+            previousIs0  = (count==1);
             if (remaining<1) return ERROR(GENERIC);
             while (remaining<threshold) { nbBits--; threshold>>=1; }
         }
         if (bitCount>16) {
-            if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+            if ((!writeIsSafe) && (out > oend - 2))
+                return ERROR(dstSize_tooSmall);   /* Buffer overflow */
             out[0] = (BYTE)bitStream;
             out[1] = (BYTE)(bitStream>>8);
             out += 2;
@@ -259,19 +290,23 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
             bitCount -= 16;
     }   }
 
+    if (remaining != 1)
+        return ERROR(GENERIC);  /* incorrect normalized distribution */
+    assert(symbol <= alphabetSize);
+
     /* flush remaining bitStream */
-    if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+    if ((!writeIsSafe) && (out > oend - 2))
+        return ERROR(dstSize_tooSmall);   /* Buffer overflow */
     out[0] = (BYTE)bitStream;
     out[1] = (BYTE)(bitStream>>8);
     out+= (bitCount+7) /8;
 
-    if (charnum > maxSymbolValue + 1) return ERROR(GENERIC);
-
     return (out-ostart);
 }
 
 
-size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+                  const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
 {
     if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported */
     if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported */
@@ -279,179 +314,13 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalized
     if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
         return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
 
-    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
+    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1 /* write in buffer is safe */);
 }
 
 
-
-/*-**************************************************************
-*  Counting histogram
-****************************************************************/
-/*! FSE_count_simple
-    This function counts byte values within `src`, and store the histogram into table `count`.
-    It doesn't use any additional memory.
-    But this function is unsafe : it doesn't check that all values within `src` can fit into `count`.
-    For this reason, prefer using a table `count` with 256 elements.
-    @return : count of most numerous element.
-*/
-size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
-                        const void* src, size_t srcSize)
-{
-    const BYTE* ip = (const BYTE*)src;
-    const BYTE* const end = ip + srcSize;
-    unsigned maxSymbolValue = *maxSymbolValuePtr;
-    unsigned max=0;
-
-    memset(count, 0, (maxSymbolValue+1)*sizeof(*count));
-    if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; }
-
-    while (ip<end) count[*ip++]++;
-
-    while (!count[maxSymbolValue]) maxSymbolValue--;
-    *maxSymbolValuePtr = maxSymbolValue;
-
-    {   U32 s; for (s=0; s<=maxSymbolValue; s++) if (count[s] > max) max = count[s]; }
-
-    return (size_t)max;
-}
-
-
-/* FSE_count_parallel_wksp() :
- * Same as FSE_count_parallel(), but using an externally provided scratch buffer.
- * `workSpace` size must be a minimum of `1024 * sizeof(unsigned)`.
- * @return : largest histogram frequency, or an error code (notably when histogram would be larger than *maxSymbolValuePtr). */
-static size_t FSE_count_parallel_wksp(
-                                unsigned* count, unsigned* maxSymbolValuePtr,
-                                const void* source, size_t sourceSize,
-                                unsigned checkMax, unsigned* const workSpace)
-{
-    const BYTE* ip = (const BYTE*)source;
-    const BYTE* const iend = ip+sourceSize;
-    unsigned maxSymbolValue = *maxSymbolValuePtr;
-    unsigned max=0;
-    U32* const Counting1 = workSpace;
-    U32* const Counting2 = Counting1 + 256;
-    U32* const Counting3 = Counting2 + 256;
-    U32* const Counting4 = Counting3 + 256;
-
-    memset(workSpace, 0, 4*256*sizeof(unsigned));
-
-    /* safety checks */
-    if (!sourceSize) {
-        memset(count, 0, maxSymbolValue + 1);
-        *maxSymbolValuePtr = 0;
-        return 0;
-    }
-    if (!maxSymbolValue) maxSymbolValue = 255;            /* 0 == default */
-
-    /* by stripes of 16 bytes */
-    {   U32 cached = MEM_read32(ip); ip += 4;
-        while (ip < iend-15) {
-            U32 c = cached; cached = MEM_read32(ip); ip += 4;
-            Counting1[(BYTE) c     ]++;
-            Counting2[(BYTE)(c>>8) ]++;
-            Counting3[(BYTE)(c>>16)]++;
-            Counting4[       c>>24 ]++;
-            c = cached; cached = MEM_read32(ip); ip += 4;
-            Counting1[(BYTE) c     ]++;
-            Counting2[(BYTE)(c>>8) ]++;
-            Counting3[(BYTE)(c>>16)]++;
-            Counting4[       c>>24 ]++;
-            c = cached; cached = MEM_read32(ip); ip += 4;
-            Counting1[(BYTE) c     ]++;
-            Counting2[(BYTE)(c>>8) ]++;
-            Counting3[(BYTE)(c>>16)]++;
-            Counting4[       c>>24 ]++;
-            c = cached; cached = MEM_read32(ip); ip += 4;
-            Counting1[(BYTE) c     ]++;
-            Counting2[(BYTE)(c>>8) ]++;
-            Counting3[(BYTE)(c>>16)]++;
-            Counting4[       c>>24 ]++;
-        }
-        ip-=4;
-    }
-
-    /* finish last symbols */
-    while (ip<iend) Counting1[*ip++]++;
-
-    if (checkMax) {   /* verify stats will fit into destination table */
-        U32 s; for (s=255; s>maxSymbolValue; s--) {
-            Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
-            if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall);
-    }   }
-
-    {   U32 s;
-        if (maxSymbolValue > 255) maxSymbolValue = 255;
-        for (s=0; s<=maxSymbolValue; s++) {
-            count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
-            if (count[s] > max) max = count[s];
-    }   }
-
-    while (!count[maxSymbolValue]) maxSymbolValue--;
-    *maxSymbolValuePtr = maxSymbolValue;
-    return (size_t)max;
-}
-
-/* FSE_countFast_wksp() :
- * Same as FSE_countFast(), but using an externally provided scratch buffer.
- * `workSpace` size must be table of >= `1024` unsigned */
-size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
-                          const void* source, size_t sourceSize,
-                          unsigned* workSpace)
-{
-    if (sourceSize < 1500) /* heuristic threshold */
-        return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize);
-    return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace);
-}
-
-/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */
-size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
-                     const void* source, size_t sourceSize)
-{
-    unsigned tmpCounters[1024];
-    return FSE_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters);
-}
-
-/* FSE_count_wksp() :
- * Same as FSE_count(), but using an externally provided scratch buffer.
- * `workSpace` size must be table of >= `1024` unsigned */
-size_t FSE_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
-                 const void* source, size_t sourceSize, unsigned* workSpace)
-{
-    if (*maxSymbolValuePtr < 255)
-        return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 1, workSpace);
-    *maxSymbolValuePtr = 255;
-    return FSE_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace);
-}
-
-size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr,
-                 const void* src, size_t srcSize)
-{
-    unsigned tmpCounters[1024];
-    return FSE_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters);
-}
-
-
-
 /*-**************************************************************
 *  FSE Compression Code
 ****************************************************************/
-/*! FSE_sizeof_CTable() :
-    FSE_CTable is a variable size structure which contains :
-    `U16 tableLog;`
-    `U16 maxSymbolValue;`
-    `U16 nextStateNumber[1 << tableLog];`                         // This size is variable
-    `FSE_symbolCompressionTransform symbolTT[maxSymbolValue+1];`  // This size is variable
-Allocation is manual (C standard does not support variable-size structures).
-*/
-size_t FSE_sizeof_CTable (unsigned maxSymbolValue, unsigned tableLog)
-{
-    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
-    return FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
-}
 
 FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
 {
@@ -466,7 +335,7 @@ void FSE_freeCTable (FSE_CTable* ct) { free(ct); }
 /* provides the minimum logSize to safely represent a distribution */
 static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
 {
-    U32 minBitsSrc = BIT_highbit32((U32)(srcSize - 1)) + 1;
+    U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1;
     U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
     U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
     assert(srcSize > 1); /* Not supported, RLE should be used instead */
@@ -529,6 +398,9 @@ static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count,
     }
     ToDistribute = (1 << tableLog) - distributed;
 
+    if (ToDistribute == 0)
+        return 0;
+
     if ((total / ToDistribute) > lowOne) {
         /* risk of rounding to zero */
         lowOne = (U32)((total * 3) / (ToDistribute * 2));
@@ -629,11 +501,11 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
         U32 s;
         U32 nTotal = 0;
         for (s=0; s<=maxSymbolValue; s++)
-            printf("%3i: %4i \n", s, normalizedCounter[s]);
+            RAWLOG(2, "%3i: %4i \n", s, normalizedCounter[s]);
         for (s=0; s<=maxSymbolValue; s++)
             nTotal += abs(normalizedCounter[s]);
         if (nTotal != (1U<<tableLog))
-            printf("Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
+            RAWLOG(2, "Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
         if (maxCount == 1) return 0;   /* each symbol present maximum once => not compressible */
         if (maxCount < (srcSize >> 7)) return 0;   /* Heuristic : not compressible enough */
@@ -835,7 +707,7 @@ typedef struct {
 size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog)
 {
     fseWkspMax_t scratchBuffer;
-    FSE_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE));   /* compilation failures here means scratchBuffer is not large enough */
+    DEBUG_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE));   /* compilation failures here means scratchBuffer is not large enough */
     if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
     return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, &scratchBuffer, sizeof(scratchBuffer));
 }
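The new `case 0` branch in FSE_buildCTable_wksp() fills deltaNbBits even for symbols with zero normalized probability, so that FSE_getMaxNbBits() stays meaningful for them. A standalone sketch of the arithmetic, assuming the usual FSE_getMaxNbBits() rounding of `(deltaNbBits + 0xFFFF) >> 16` from fse.h's static-linking section:

#include <assert.h>
#include <stdio.h>

/* mirrors the FSE_getMaxNbBits() rounding; illustrative helper, not the library function itself */
static unsigned maxNbBitsFromDelta(unsigned deltaNbBits)
{
    return (deltaNbBits + ((1u<<16) - 1)) >> 16;
}

int main(void)
{
    unsigned tableLog;
    for (tableLog=5; tableLog<16; tableLog++) {   /* tableLog < 16, as asserted in the patch */
        unsigned const deltaNbBits = ((tableLog+1) << 16) - (1u << tableLog);
        assert(maxNbBitsFromDelta(deltaNbBits) == tableLog+1);   /* an absent symbol costs tableLog+1 bits */
    }
    printf("deltaNbBits for zero-probability symbols always reports tableLog+1 bits\n");
    return 0;
}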
diff --git a/lib/compress/hist.c b/lib/compress/hist.c
new file mode 100644
index 000000000000..16524756b8dc
--- /dev/null
+++ b/lib/compress/hist.c
@@ -0,0 +1,195 @@
+/* ******************************************************************
+   hist : Histogram functions
+   part of Finite State Entropy project
+   Copyright (C) 2013-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* --- dependencies --- */
+#include "mem.h"             /* U32, BYTE, etc. */
+#include "debug.h"           /* assert, DEBUGLOG */
+#include "error_private.h"   /* ERROR */
+#include "hist.h"
+
+
+/* --- Error management --- */
+unsigned HIST_isError(size_t code) { return ERR_isError(code); }
+
+/*-**************************************************************
+ *  Histogram functions
+ ****************************************************************/
+unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+                           const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* const end = ip + srcSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;
+    unsigned largestCount=0;
+
+    memset(count, 0, (maxSymbolValue+1) * sizeof(*count));
+    if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; }
+
+    while (ip<end) count[*ip++]++;
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;
+    *maxSymbolValuePtr = maxSymbolValue;
+
+    {   U32 s;
+        for (s=0; s<=maxSymbolValue; s++)
+            if (count[s] > largestCount) largestCount = count[s];
+    }
+
+    return largestCount;
+}
+
+
+/* HIST_count_parallel_wksp() :
+ * store histogram into 4 intermediate tables, recombined at the end.
+ * this design makes better use of OoO cpus,
+ * and is noticeably faster when some values are heavily repeated.
+ * But it needs some additional workspace for intermediate tables.
+ * `workSpace` size must be a table of size >= HIST_WKSP_SIZE_U32.
+ * @return : largest histogram frequency,
+ *           or an error code (notably when histogram would be larger than *maxSymbolValuePtr). */
+static size_t HIST_count_parallel_wksp(
+                                unsigned* count, unsigned* maxSymbolValuePtr,
+                                const void* source, size_t sourceSize,
+                                unsigned checkMax,
+                                unsigned* const workSpace)
+{
+    const BYTE* ip = (const BYTE*)source;
+    const BYTE* const iend = ip+sourceSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;
+    unsigned max=0;
+    U32* const Counting1 = workSpace;
+    U32* const Counting2 = Counting1 + 256;
+    U32* const Counting3 = Counting2 + 256;
+    U32* const Counting4 = Counting3 + 256;
+
+    memset(workSpace, 0, 4*256*sizeof(unsigned));
+
+    /* safety checks */
+    if (!sourceSize) {
+        memset(count, 0, maxSymbolValue + 1);
+        *maxSymbolValuePtr = 0;
+        return 0;
+    }
+    if (!maxSymbolValue) maxSymbolValue = 255;            /* 0 == default */
+
+    /* by stripes of 16 bytes */
+    {   U32 cached = MEM_read32(ip); ip += 4;
+        while (ip < iend-15) {
+            U32 c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+        }
+        ip-=4;
+    }
+
+    /* finish last symbols */
+    while (ip<iend) Counting1[*ip++]++;
+
+    if (checkMax) {   /* verify stats will fit into destination table */
+        U32 s; for (s=255; s>maxSymbolValue; s--) {
+            Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
+            if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall);
+    }   }
+
+    {   U32 s;
+        if (maxSymbolValue > 255) maxSymbolValue = 255;
+        for (s=0; s<=maxSymbolValue; s++) {
+            count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
+            if (count[s] > max) max = count[s];
+    }   }
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;
+    *maxSymbolValuePtr = maxSymbolValue;
+    return (size_t)max;
+}
+
+/* HIST_countFast_wksp() :
+ * Same as HIST_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` size must be table of >= HIST_WKSP_SIZE_U32 unsigned */
+size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                          const void* source, size_t sourceSize,
+                          unsigned* workSpace)
+{
+    if (sourceSize < 1500) /* heuristic threshold */
+        return HIST_count_simple(count, maxSymbolValuePtr, source, sourceSize);
+    return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace);
+}
+
+/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */
+size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+                     const void* source, size_t sourceSize)
+{
+    unsigned tmpCounters[HIST_WKSP_SIZE_U32];
+    return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters);
+}
+
+/* HIST_count_wksp() :
+ * Same as HIST_count(), but using an externally provided scratch buffer.
+ * `workSpace` size must be table of >= HIST_WKSP_SIZE_U32 unsigned */
+size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* source, size_t sourceSize, unsigned* workSpace)
+{
+    if (*maxSymbolValuePtr < 255)
+        return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 1, workSpace);
+    *maxSymbolValuePtr = 255;
+    return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace);
+}
+
+size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* src, size_t srcSize)
+{
+    unsigned tmpCounters[HIST_WKSP_SIZE_U32];
+    return HIST_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters);
+}
diff --git a/lib/compress/hist.h b/lib/compress/hist.h
new file mode 100644
index 000000000000..8b1991a90bd3
--- /dev/null
+++ b/lib/compress/hist.h
@@ -0,0 +1,92 @@
+/* ******************************************************************
+   hist : Histogram functions
+   part of Finite State Entropy project
+   Copyright (C) 2013-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* --- dependencies --- */
+#include <stddef.h>   /* size_t */
+
+
+/* --- simple histogram functions --- */
+
+/*! HIST_count():
+ *  Provides the precise count of each byte within a table 'count'.
+ *  'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1).
+ *  Updates *maxSymbolValuePtr with actual largest symbol value detected.
+ *  @return : count of the most frequent symbol (which isn't identified).
+ *            or an error code, which can be tested using HIST_isError().
+ *            note : if return == srcSize, there is only one symbol.
+ */
+size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
+                  const void* src, size_t srcSize);
+
+unsigned HIST_isError(size_t code);  /**< tells if a return value is an error code */
+
+
+/* --- advanced histogram functions --- */
+
+#define HIST_WKSP_SIZE_U32 1024
+/** HIST_count_wksp() :
+ *  Same as HIST_count(), but using an externally provided scratch buffer.
+ *  Benefit is this function will use very little stack space.
+ * `workSpace` must be a table of unsigned of size >= HIST_WKSP_SIZE_U32
+ */
+size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                       const void* src, size_t srcSize,
+                       unsigned* workSpace);
+
+/** HIST_countFast() :
+ *  same as HIST_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr.
+ *  This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr`
+ */
+size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+                      const void* src, size_t srcSize);
+
+/** HIST_countFast_wksp() :
+ *  Same as HIST_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` must be a table of unsigned of size >= HIST_WKSP_SIZE_U32
+ */
+size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                           const void* src, size_t srcSize,
+                           unsigned* workSpace);
+
+/*! HIST_count_simple() :
+ *  Same as HIST_countFast(), this function is unsafe,
+ *  and will segfault if any value within `src` is `> *maxSymbolValuePtr`.
+ *  It is also a bit slower for large inputs.
+ *  However, it does not need any additional memory (not even on stack).
+ * @return : count of the most frequent symbol.
+ *  Note this function doesn't produce any error (i.e. it must succeed).
+ */
+unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+                           const void* src, size_t srcSize);
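A short usage sketch for the new HIST entry points declared above, assuming hist.c and its dependencies (mem.h, error_private.h) are compiled alongside it:

#include <stdio.h>
#include "hist.h"

int main(void)
{
    static const char data[] = "aaaa bbbb cccc aaaa";
    unsigned count[256];                   /* needs at least maxSymbolValue+1 entries */
    unsigned maxSymbolValue = 255;         /* in/out : updated to the largest byte actually present */
    unsigned wksp[HIST_WKSP_SIZE_U32];     /* scratch buffer keeps HIST_count_wksp() off the heap */

    size_t const largest = HIST_count_wksp(count, &maxSymbolValue, data, sizeof(data)-1, wksp);
    if (HIST_isError(largest)) return 1;

    printf("most frequent byte appears %u times, maxSymbolValue=%u\n",
           (unsigned)largest, maxSymbolValue);
    return 0;
}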
diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c
index 83230b415f9c..4c40572f2284 100644
--- a/lib/compress/huf_compress.c
+++ b/lib/compress/huf_compress.c
@@ -45,8 +45,9 @@
 ****************************************************************/
 #include <string.h>     /* memcpy, memset */
 #include <stdio.h>      /* printf (debug) */
-#include "bitstream.h"
 #include "compiler.h"
+#include "bitstream.h"
+#include "hist.h"
 #define FSE_STATIC_LINKING_ONLY   /* FSE_optimalTableLog_internal */
 #include "fse.h"        /* header compression */
 #define HUF_STATIC_LINKING_ONLY
@@ -58,7 +59,7 @@
 *  Error Management
 ****************************************************************/
 #define HUF_isError ERR_isError
-#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+#define HUF_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)   /* use only *after* variable declarations */
 #define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e
 #define CHECK_F(f)   { CHECK_V_F(_var_err__, f); }
 
@@ -81,7 +82,7 @@ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxS
  * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
  */
 #define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
-size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize)
+static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize)
 {
     BYTE* const ostart = (BYTE*) dst;
     BYTE* op = ostart;
@@ -100,9 +101,9 @@ size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable,
     if (wtSize <= 1) return 0;  /* Not compressible */
 
     /* Scan input and build symbol stats */
-    {   CHECK_V_F(maxCount, FSE_count_simple(count, &maxSymbolValue, weightTable, wtSize) );
+    {   unsigned const maxCount = HIST_count_simple(count, &maxSymbolValue, weightTable, wtSize);   /* never fails */
         if (maxCount == wtSize) return 1;   /* only a single symbol in src : rle */
-        if (maxCount == 1) return 0;         /* each symbol present maximum once => not compressible */
+        if (maxCount == 1) return 0;        /* each symbol present maximum once => not compressible */
     }
 
     tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue);
@@ -216,6 +217,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, U32* maxSymbolValuePtr, const void* src
     return readSize;
 }
 
+U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue)
+{
+    const HUF_CElt* table = (const HUF_CElt*)symbolTable;
+    assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
+    return table[symbolValue].nbBits;
+}
+
 
 typedef struct nodeElt_s {
     U32 count;
@@ -660,9 +668,9 @@ static size_t HUF_compress_internal (
     }
 
     /* Scan input and build symbol stats */
-    {   CHECK_V_F(largest, FSE_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->count) );
+    {   CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->count) );
         if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; }   /* single symbol, rle */
-        if (largest <= (srcSize >> 7)+1) return 0;   /* heuristic : probably not compressible enough */
+        if (largest <= (srcSize >> 7)+4) return 0;   /* heuristic : probably not compressible enough */
     }
 
     /* Check validity of previous table */
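The huf_compress.c hunk above also tightens the "probably not compressible" cutoff from (srcSize >> 7)+1 to (srcSize >> 7)+4. A small sketch of what that threshold means in practice (the helper name is hypothetical, not from the patch):

#include <stddef.h>
#include <stdio.h>

/* same shape as the patched check : give up when the top symbol is barely above average frequency */
static int worthTryingHuffman(size_t largestCount, size_t srcSize)
{
    return largestCount > (srcSize >> 7) + 4;
}

int main(void)
{
    /* for a 4096-byte block, the most frequent byte must occur at least 37 times (4096/128 + 4 = 36) */
    printf("top symbol 36x  : %s\n", worthTryingHuffman(36, 4096)  ? "try Huffman" : "emit raw");
    printf("top symbol 200x : %s\n", worthTryingHuffman(200, 4096) ? "try Huffman" : "emit raw");
    return 0;
}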
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index 2aa26da4cd43..5f6280a8f7cc 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -8,21 +8,13 @@
  * You may select, at your option, one of the above-listed licenses.
  */
 
-
-/*-*************************************
-*  Tuning parameters
-***************************************/
-#ifndef ZSTD_CLEVEL_DEFAULT
-#  define ZSTD_CLEVEL_DEFAULT 3
-#endif
-
-
 /*-*************************************
 *  Dependencies
 ***************************************/
 #include <string.h>         /* memset */
 #include "cpu.h"
 #include "mem.h"
+#include "hist.h"           /* HIST_countFast_wksp */
 #define FSE_STATIC_LINKING_ONLY   /* FSE_encodeSymbol */
 #include "fse.h"
 #define HUF_STATIC_LINKING_ONLY
@@ -54,7 +46,6 @@ struct ZSTD_CDict_s {
     size_t workspaceSize;
     ZSTD_matchState_t matchState;
     ZSTD_compressedBlockState_t cBlockState;
-    ZSTD_compressionParameters cParams;
     ZSTD_customMem customMem;
     U32 dictID;
 };  /* typedef'd to ZSTD_CDict within "zstd.h" */
@@ -64,17 +55,26 @@ ZSTD_CCtx* ZSTD_createCCtx(void)
     return ZSTD_createCCtx_advanced(ZSTD_defaultCMem);
 }
 
+static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager)
+{
+    assert(cctx != NULL);
+    memset(cctx, 0, sizeof(*cctx));
+    cctx->customMem = memManager;
+    cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+    {   size_t const err = ZSTD_CCtx_resetParameters(cctx);
+        assert(!ZSTD_isError(err));
+        (void)err;
+    }
+}
+
 ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem)
 {
     ZSTD_STATIC_ASSERT(zcss_init==0);
     ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN==(0ULL - 1));
     if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
-    {   ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_calloc(sizeof(ZSTD_CCtx), customMem);
+    {   ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_malloc(sizeof(ZSTD_CCtx), customMem);
         if (!cctx) return NULL;
-        cctx->customMem = customMem;
-        cctx->requestedParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
-        cctx->requestedParams.fParams.contentSizeFlag = 1;
-        cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+        ZSTD_initCCtx(cctx, customMem);
         return cctx;
     }
 }
@@ -102,17 +102,24 @@ ZSTD_CCtx* ZSTD_initStaticCCtx(void *workspace, size_t workspaceSize)
     return cctx;
 }
 
-size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
+static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx)
 {
-    if (cctx==NULL) return 0;   /* support free on NULL */
-    if (cctx->staticSize) return ERROR(memory_allocation);   /* not compatible with static CCtx */
+    assert(cctx != NULL);
+    assert(cctx->staticSize == 0);
     ZSTD_free(cctx->workSpace, cctx->customMem); cctx->workSpace = NULL;
     ZSTD_freeCDict(cctx->cdictLocal); cctx->cdictLocal = NULL;
 #ifdef ZSTD_MULTITHREAD
     ZSTDMT_freeCCtx(cctx->mtctx); cctx->mtctx = NULL;
 #endif
+}
+
+size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
+{
+    if (cctx==NULL) return 0;   /* support free on NULL */
+    if (cctx->staticSize) return ERROR(memory_allocation);   /* not compatible with static CCtx */
+    ZSTD_freeCCtxContent(cctx);
     ZSTD_free(cctx, cctx->customMem);
-    return 0;   /* reserved as a potential error code in the future */
+    return 0;
 }
 
 
@@ -143,21 +150,6 @@ size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs)
 /* private API call, for dictBuilder only */
 const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); }
 
-ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
-        const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize)
-{
-    ZSTD_compressionParameters cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize);
-    if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
-    if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog;
-    if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog;
-    if (CCtxParams->cParams.chainLog) cParams.chainLog = CCtxParams->cParams.chainLog;
-    if (CCtxParams->cParams.searchLog) cParams.searchLog = CCtxParams->cParams.searchLog;
-    if (CCtxParams->cParams.searchLength) cParams.searchLength = CCtxParams->cParams.searchLength;
-    if (CCtxParams->cParams.targetLength) cParams.targetLength = CCtxParams->cParams.targetLength;
-    if (CCtxParams->cParams.strategy) cParams.strategy = CCtxParams->cParams.strategy;
-    return cParams;
-}
-
 static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
         ZSTD_compressionParameters cParams)
 {
@@ -251,7 +243,6 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
     case ZSTD_p_minMatch:
     case ZSTD_p_targetLength:
     case ZSTD_p_compressionStrategy:
-    case ZSTD_p_compressLiterals:
         return 1;
 
     case ZSTD_p_format:
@@ -268,6 +259,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
     case ZSTD_p_ldmMinMatch:
     case ZSTD_p_ldmBucketSizeLog:
     case ZSTD_p_ldmHashEveryLog:
+    case ZSTD_p_forceAttachDict:
     default:
         return 0;
     }
@@ -302,7 +294,6 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned v
         if (cctx->cdict) return ERROR(stage_wrong);
         return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value);
 
-    case ZSTD_p_compressLiterals:
     case ZSTD_p_contentSizeFlag:
     case ZSTD_p_checksumFlag:
     case ZSTD_p_dictIDFlag:
@@ -313,6 +304,9 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned v
                                    * default : 0 when using a CDict, 1 when using a Prefix */
         return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value);
 
+    case ZSTD_p_forceAttachDict:
+        return ZSTD_CCtxParam_setParameter(&cctx->requestedParams, param, value);
+
     case ZSTD_p_nbWorkers:
         if ((value>0) && cctx->staticSize) {
             return ERROR(parameter_unsupported);  /* MT not compatible with static alloc */
@@ -351,7 +345,6 @@ size_t ZSTD_CCtxParam_setParameter(
         int cLevel = (int)value;  /* cast expected to restore negative sign */
         if (cLevel > ZSTD_maxCLevel()) cLevel = ZSTD_maxCLevel();
         if (cLevel) {  /* 0 : does not change current level */
-            CCtxParams->disableLiteralCompression = (cLevel<0);  /* negative levels disable huffman */
             CCtxParams->compressionLevel = cLevel;
         }
         if (CCtxParams->compressionLevel >= 0) return CCtxParams->compressionLevel;
@@ -399,10 +392,6 @@ size_t ZSTD_CCtxParam_setParameter(
         CCtxParams->cParams.strategy = (ZSTD_strategy)value;
         return (size_t)CCtxParams->cParams.strategy;
 
-    case ZSTD_p_compressLiterals:
-        CCtxParams->disableLiteralCompression = !value;
-        return !CCtxParams->disableLiteralCompression;
-
     case ZSTD_p_contentSizeFlag :
         /* Content size written in frame header _when known_ (default:1) */
         DEBUGLOG(4, "set content size flag = %u", (value>0));
@@ -423,6 +412,12 @@ size_t ZSTD_CCtxParam_setParameter(
         CCtxParams->forceWindow = (value > 0);
         return CCtxParams->forceWindow;
 
+    case ZSTD_p_forceAttachDict :
+        CCtxParams->attachDictPref = value ?
+                                    (value > 0 ? ZSTD_dictForceAttach : ZSTD_dictForceCopy) :
+                                     ZSTD_dictDefaultAttach;
+        return CCtxParams->attachDictPref;
+
     case ZSTD_p_nbWorkers :
 #ifndef ZSTD_MULTITHREAD
         if (value>0) return ERROR(parameter_unsupported);
@@ -477,6 +472,98 @@ size_t ZSTD_CCtxParam_setParameter(
     }
 }
 
+size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned* value)
+{
+    return ZSTD_CCtxParam_getParameter(&cctx->requestedParams, param, value);
+}
+
+size_t ZSTD_CCtxParam_getParameter(
+        ZSTD_CCtx_params* CCtxParams, ZSTD_cParameter param, unsigned* value)
+{
+    switch(param)
+    {
+    case ZSTD_p_format :
+        *value = CCtxParams->format;
+        break;
+    case ZSTD_p_compressionLevel :
+        *value = CCtxParams->compressionLevel;
+        break;
+    case ZSTD_p_windowLog :
+        *value = CCtxParams->cParams.windowLog;
+        break;
+    case ZSTD_p_hashLog :
+        *value = CCtxParams->cParams.hashLog;
+        break;
+    case ZSTD_p_chainLog :
+        *value = CCtxParams->cParams.chainLog;
+        break;
+    case ZSTD_p_searchLog :
+        *value = CCtxParams->cParams.searchLog;
+        break;
+    case ZSTD_p_minMatch :
+        *value = CCtxParams->cParams.searchLength;
+        break;
+    case ZSTD_p_targetLength :
+        *value = CCtxParams->cParams.targetLength;
+        break;
+    case ZSTD_p_compressionStrategy :
+        *value = (unsigned)CCtxParams->cParams.strategy;
+        break;
+    case ZSTD_p_contentSizeFlag :
+        *value = CCtxParams->fParams.contentSizeFlag;
+        break;
+    case ZSTD_p_checksumFlag :
+        *value = CCtxParams->fParams.checksumFlag;
+        break;
+    case ZSTD_p_dictIDFlag :
+        *value = !CCtxParams->fParams.noDictIDFlag;
+        break;
+    case ZSTD_p_forceMaxWindow :
+        *value = CCtxParams->forceWindow;
+        break;
+    case ZSTD_p_forceAttachDict :
+        *value = CCtxParams->attachDictPref;
+        break;
+    case ZSTD_p_nbWorkers :
+#ifndef ZSTD_MULTITHREAD
+        assert(CCtxParams->nbWorkers == 0);
+#endif
+        *value = CCtxParams->nbWorkers;
+        break;
+    case ZSTD_p_jobSize :
+#ifndef ZSTD_MULTITHREAD
+        return ERROR(parameter_unsupported);
+#else
+        *value = CCtxParams->jobSize;
+        break;
+#endif
+    case ZSTD_p_overlapSizeLog :
+#ifndef ZSTD_MULTITHREAD
+        return ERROR(parameter_unsupported);
+#else
+        *value = CCtxParams->overlapSizeLog;
+        break;
+#endif
+    case ZSTD_p_enableLongDistanceMatching :
+        *value = CCtxParams->ldmParams.enableLdm;
+        break;
+    case ZSTD_p_ldmHashLog :
+        *value = CCtxParams->ldmParams.hashLog;
+        break;
+    case ZSTD_p_ldmMinMatch :
+        *value = CCtxParams->ldmParams.minMatchLength;
+        break;
+    case ZSTD_p_ldmBucketSizeLog :
+        *value = CCtxParams->ldmParams.bucketSizeLog;
+        break;
+    case ZSTD_p_ldmHashEveryLog :
+        *value = CCtxParams->ldmParams.hashEveryLog;
+        break;
+    default: return ERROR(parameter_unsupported);
+    }
+    return 0;
+}
+
 /** ZSTD_CCtx_setParametersUsingCCtxParams() :
  *  just applies `params` into `cctx`
  *  no action is performed, parameters are merely stored.
@@ -487,6 +574,7 @@ size_t ZSTD_CCtxParam_setParameter(
 size_t ZSTD_CCtx_setParametersUsingCCtxParams(
         ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params)
 {
+    DEBUGLOG(4, "ZSTD_CCtx_setParametersUsingCCtxParams");
     if (cctx->streamStage != zcss_init) return ERROR(stage_wrong);
     if (cctx->cdict) return ERROR(stage_wrong);
 
@@ -565,18 +653,19 @@ size_t ZSTD_CCtx_refPrefix_advanced(
     return 0;
 }
 
-static void ZSTD_startNewCompression(ZSTD_CCtx* cctx)
+/*! ZSTD_CCtx_reset() :
+ *  Also dumps dictionary */
+void ZSTD_CCtx_reset(ZSTD_CCtx* cctx)
 {
     cctx->streamStage = zcss_init;
     cctx->pledgedSrcSizePlusOne = 0;
 }
 
-/*! ZSTD_CCtx_reset() :
- *  Also dumps dictionary */
-void ZSTD_CCtx_reset(ZSTD_CCtx* cctx)
+size_t ZSTD_CCtx_resetParameters(ZSTD_CCtx* cctx)
 {
-    ZSTD_startNewCompression(cctx);
+    if (cctx->streamStage != zcss_init) return ERROR(stage_wrong);
     cctx->cdict = NULL;
+    return ZSTD_CCtxParams_reset(&cctx->requestedParams);
 }
 
 /** ZSTD_checkCParams() :
@@ -589,8 +678,9 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
     CLAMPCHECK(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
     CLAMPCHECK(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX);
     CLAMPCHECK(cParams.searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX);
-    if ((U32)(cParams.targetLength) < ZSTD_TARGETLENGTH_MIN)
-        return ERROR(parameter_unsupported);
+    ZSTD_STATIC_ASSERT(ZSTD_TARGETLENGTH_MIN == 0);
+    if (cParams.targetLength > ZSTD_TARGETLENGTH_MAX)
+        return ERROR(parameter_outOfBound);
     if ((U32)(cParams.strategy) > (U32)ZSTD_btultra)
         return ERROR(parameter_unsupported);
     return 0;
@@ -599,7 +689,8 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
 /** ZSTD_clampCParams() :
  *  make CParam values within valid range.
  *  @return : valid CParams */
-static ZSTD_compressionParameters ZSTD_clampCParams(ZSTD_compressionParameters cParams)
+static ZSTD_compressionParameters
+ZSTD_clampCParams(ZSTD_compressionParameters cParams)
 {
 #   define CLAMP(val,min,max) {      \
         if (val<min) val=min;        \
         else if (val>max) val=max;   \
     }
-    if ((U32)(cParams.targetLength) < ZSTD_TARGETLENGTH_MIN) cParams.targetLength = ZSTD_TARGETLENGTH_MIN;
-    if ((U32)(cParams.strategy) > (U32)ZSTD_btultra) cParams.strategy = ZSTD_btultra;
+    ZSTD_STATIC_ASSERT(ZSTD_TARGETLENGTH_MIN == 0);
+    if (cParams.targetLength > ZSTD_TARGETLENGTH_MAX)
+        cParams.targetLength = ZSTD_TARGETLENGTH_MAX;
+    CLAMP(cParams.strategy, ZSTD_fast, ZSTD_btultra);
     return cParams;
 }
 
@@ -627,8 +720,11 @@ static U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat)
     optimize `cPar` for a given input (`srcSize` and `dictSize`).
     mostly downsizing to reduce memory consumption and initialization latency.
     Both `srcSize` and `dictSize` are optional (use 0 if unknown).
-    Note : cPar is considered validated at this stage. Use ZSTD_checkCParams() to ensure that condition. */
-ZSTD_compressionParameters ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize)
+    Note : cPar is assumed validated. Use ZSTD_checkCParams() to ensure this condition. */
+static ZSTD_compressionParameters
+ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
+                            unsigned long long srcSize,
+                            size_t dictSize)
 {
     static const U64 minSrcSize = 513; /* (1<<9) + 1 */
     static const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
@@ -648,7 +744,7 @@ ZSTD_compressionParameters ZSTD_adjustCParams_internal(ZSTD_compressionParameter
                             ZSTD_highbit32(tSize-1) + 1;
         if (cPar.windowLog > srcLog) cPar.windowLog = srcLog;
     }
-    if (cPar.hashLog > cPar.windowLog) cPar.hashLog = cPar.windowLog;
+    if (cPar.hashLog > cPar.windowLog+1) cPar.hashLog = cPar.windowLog+1;
     {   U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy);
         if (cycleLog > cPar.windowLog)
             cPar.chainLog -= (cycleLog - cPar.windowLog);
@@ -660,13 +756,34 @@ ZSTD_compressionParameters ZSTD_adjustCParams_internal(ZSTD_compressionParameter
     return cPar;
 }
 
-ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize)
+ZSTD_compressionParameters
+ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
+                   unsigned long long srcSize,
+                   size_t dictSize)
 {
     cPar = ZSTD_clampCParams(cPar);
     return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize);
 }
 
-static size_t ZSTD_sizeof_matchState(ZSTD_compressionParameters const* cParams, const U32 forCCtx)
+ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+        const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize)
+{
+    ZSTD_compressionParameters cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize);
+    if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
+    if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog;
+    if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog;
+    if (CCtxParams->cParams.chainLog) cParams.chainLog = CCtxParams->cParams.chainLog;
+    if (CCtxParams->cParams.searchLog) cParams.searchLog = CCtxParams->cParams.searchLog;
+    if (CCtxParams->cParams.searchLength) cParams.searchLength = CCtxParams->cParams.searchLength;
+    if (CCtxParams->cParams.targetLength) cParams.targetLength = CCtxParams->cParams.targetLength;
+    if (CCtxParams->cParams.strategy) cParams.strategy = CCtxParams->cParams.strategy;
+    assert(!ZSTD_checkCParams(cParams));
+    return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize);
+}
+
+static size_t
+ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
+                       const U32 forCCtx)
 {
     size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog);
     size_t const hSize = ((size_t)1) << cParams->hashLog;
@@ -693,7 +810,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
         size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
         U32    const divider = (cParams.searchLength==3) ? 3 : 4;
         size_t const maxNbSeq = blockSize / divider;
-        size_t const tokenSpace = blockSize + 11*maxNbSeq;
+        size_t const tokenSpace = WILDCOPY_OVERLENGTH + blockSize + 11*maxNbSeq;
         size_t const entropySpace = HUF_WORKSPACE_SIZE;
         size_t const blockStateSpace = 2 * sizeof(ZSTD_compressedBlockState_t);
         size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 1);
@@ -752,12 +869,14 @@ size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams)
     return ZSTD_estimateCStreamSize_usingCCtxParams(&params);
 }
 
-static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) {
+static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel)
+{
     ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, 0);
     return ZSTD_estimateCStreamSize_usingCParams(cParams);
 }
 
-size_t ZSTD_estimateCStreamSize(int compressionLevel) {
+size_t ZSTD_estimateCStreamSize(int compressionLevel)
+{
     int level;
     size_t memBudget = 0;
     for (level=1; level<=compressionLevel; level++) {
@@ -786,9 +905,27 @@ ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx)
         fp.ingested = cctx->consumedSrcSize + buffered;
         fp.consumed = cctx->consumedSrcSize;
         fp.produced = cctx->producedCSize;
+        fp.flushed  = cctx->producedCSize;   /* simplified; some data might still be left within streaming output buffer */
+        fp.currentJobID = 0;
+        fp.nbActiveWorkers = 0;
         return fp;
 }   }
 
+/*! ZSTD_toFlushNow()
+ *  Only useful for multithreading scenarios currently (nbWorkers >= 1).
+ */
+size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+    if (cctx->appliedParams.nbWorkers > 0) {
+        return ZSTDMT_toFlushNow(cctx->mtctx);
+    }
+#endif
+    (void)cctx;
+    return 0;   /* over-simplification; could also check if context is currently running in streaming mode, and in which case, report how many bytes are left to be flushed within output buffer */
+}
+
+
 
 static U32 ZSTD_equivalentCParams(ZSTD_compressionParameters cParams1,
                                   ZSTD_compressionParameters cParams2)
@@ -799,6 +936,20 @@ static U32 ZSTD_equivalentCParams(ZSTD_compressionParameters cParams1,
          & ((cParams1.searchLength==3) == (cParams2.searchLength==3));  /* hashlog3 space */
 }
 
+static void ZSTD_assertEqualCParams(ZSTD_compressionParameters cParams1,
+                                    ZSTD_compressionParameters cParams2)
+{
+    (void)cParams1;
+    (void)cParams2;
+    assert(cParams1.windowLog    == cParams2.windowLog);
+    assert(cParams1.chainLog     == cParams2.chainLog);
+    assert(cParams1.hashLog      == cParams2.hashLog);
+    assert(cParams1.searchLog    == cParams2.searchLog);
+    assert(cParams1.searchLength == cParams2.searchLength);
+    assert(cParams1.targetLength == cParams2.targetLength);
+    assert(cParams1.strategy     == cParams2.strategy);
+}
+
 /** The parameters are equivalent if ldm is not enabled in both sets or
  *  all the parameters are equivalent. */
 static U32 ZSTD_equivalentLdmParams(ldmParams_t ldmParams1,
@@ -817,33 +968,51 @@ typedef enum { ZSTDb_not_buffered, ZSTDb_buffered } ZSTD_buffered_policy_e;
 /* ZSTD_sufficientBuff() :
  * check internal buffers exist for streaming if buffPol == ZSTDb_buffered .
  * Note : they are assumed to be correctly sized if ZSTD_equivalentCParams()==1 */
-static U32 ZSTD_sufficientBuff(size_t bufferSize1, size_t blockSize1,
+static U32 ZSTD_sufficientBuff(size_t bufferSize1, size_t maxNbSeq1,
+                            size_t maxNbLit1,
                             ZSTD_buffered_policy_e buffPol2,
                             ZSTD_compressionParameters cParams2,
                             U64 pledgedSrcSize)
 {
     size_t const windowSize2 = MAX(1, (size_t)MIN(((U64)1 << cParams2.windowLog), pledgedSrcSize));
     size_t const blockSize2 = MIN(ZSTD_BLOCKSIZE_MAX, windowSize2);
+    size_t const maxNbSeq2 = blockSize2 / ((cParams2.searchLength == 3) ? 3 : 4);
+    size_t const maxNbLit2 = blockSize2;
     size_t const neededBufferSize2 = (buffPol2==ZSTDb_buffered) ? windowSize2 + blockSize2 : 0;
-    DEBUGLOG(4, "ZSTD_sufficientBuff: is windowSize2=%u <= wlog1=%u",
-                (U32)windowSize2, cParams2.windowLog);
-    DEBUGLOG(4, "ZSTD_sufficientBuff: is blockSize2=%u <= blockSize1=%u",
-                (U32)blockSize2, (U32)blockSize1);
-    return (blockSize2 <= blockSize1) /* seqStore space depends on blockSize */
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is neededBufferSize2=%u <= bufferSize1=%u",
+                (U32)neededBufferSize2, (U32)bufferSize1);
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is maxNbSeq2=%u <= maxNbSeq1=%u",
+                (U32)maxNbSeq2, (U32)maxNbSeq1);
+    DEBUGLOG(4, "ZSTD_sufficientBuff: is maxNbLit2=%u <= maxNbLit1=%u",
+                (U32)maxNbLit2, (U32)maxNbLit1);
+    return (maxNbLit2 <= maxNbLit1)
+         & (maxNbSeq2 <= maxNbSeq1)
          & (neededBufferSize2 <= bufferSize1);
 }
 
 /** Equivalence for resetCCtx purposes */
 static U32 ZSTD_equivalentParams(ZSTD_CCtx_params params1,
                                  ZSTD_CCtx_params params2,
-                                 size_t buffSize1, size_t blockSize1,
+                                 size_t buffSize1,
+                                 size_t maxNbSeq1, size_t maxNbLit1,
                                  ZSTD_buffered_policy_e buffPol2,
                                  U64 pledgedSrcSize)
 {
     DEBUGLOG(4, "ZSTD_equivalentParams: pledgedSrcSize=%u", (U32)pledgedSrcSize);
-    return ZSTD_equivalentCParams(params1.cParams, params2.cParams) &&
-           ZSTD_equivalentLdmParams(params1.ldmParams, params2.ldmParams) &&
-           ZSTD_sufficientBuff(buffSize1, blockSize1, buffPol2, params2.cParams, pledgedSrcSize);
+    if (!ZSTD_equivalentCParams(params1.cParams, params2.cParams)) {
+      DEBUGLOG(4, "ZSTD_equivalentCParams() == 0");
+      return 0;
+    }
+    if (!ZSTD_equivalentLdmParams(params1.ldmParams, params2.ldmParams)) {
+      DEBUGLOG(4, "ZSTD_equivalentLdmParams() == 0");
+      return 0;
+    }
+    if (!ZSTD_sufficientBuff(buffSize1, maxNbSeq1, maxNbLit1, buffPol2,
+                             params2.cParams, pledgedSrcSize)) {
+      DEBUGLOG(4, "ZSTD_sufficientBuff() == 0");
+      return 0;
+    }
+    return 1;
 }
 
 static void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs)
@@ -851,10 +1020,10 @@ static void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs)
     int i;
     for (i = 0; i < ZSTD_REP_NUM; ++i)
         bs->rep[i] = repStartValue[i];
-    bs->entropy.hufCTable_repeatMode = HUF_repeat_none;
-    bs->entropy.offcode_repeatMode = FSE_repeat_none;
-    bs->entropy.matchlength_repeatMode = FSE_repeat_none;
-    bs->entropy.litlength_repeatMode = FSE_repeat_none;
+    bs->entropy.huf.repeatMode = HUF_repeat_none;
+    bs->entropy.fse.offcode_repeatMode = FSE_repeat_none;
+    bs->entropy.fse.matchlength_repeatMode = FSE_repeat_none;
+    bs->entropy.fse.litlength_repeatMode = FSE_repeat_none;
 }
 
 /*! ZSTD_invalidateMatchState()
@@ -866,8 +1035,10 @@ static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms)
     ZSTD_window_clear(&ms->window);
 
     ms->nextToUpdate = ms->window.dictLimit + 1;
+    ms->nextToUpdate3 = ms->window.dictLimit + 1;
     ms->loadedDictEnd = 0;
     ms->opt.litLengthSum = 0;  /* force reset of btopt stats */
+    ms->dictMatchState = NULL;
 }
 
 /*! ZSTD_continueCCtx() :
@@ -880,6 +1051,7 @@ static size_t ZSTD_continueCCtx(ZSTD_CCtx* cctx, ZSTD_CCtx_params params, U64 pl
 
     cctx->blockSize = blockSize;   /* previous block size could be different even for same windowLog, due to pledgedSrcSize */
     cctx->appliedParams = params;
+    cctx->blockState.matchState.cParams = params.cParams;
     cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;
     cctx->consumedSrcSize = 0;
     cctx->producedCSize = 0;
@@ -900,7 +1072,11 @@ static size_t ZSTD_continueCCtx(ZSTD_CCtx* cctx, ZSTD_CCtx_params params, U64 pl
 
 typedef enum { ZSTDcrp_continue, ZSTDcrp_noMemset } ZSTD_compResetPolicy_e;
 
-static void* ZSTD_reset_matchState(ZSTD_matchState_t* ms, void* ptr, ZSTD_compressionParameters const* cParams, ZSTD_compResetPolicy_e const crp, U32 const forCCtx)
+static void*
+ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+                      void* ptr,
+                const ZSTD_compressionParameters* cParams,
+                      ZSTD_compResetPolicy_e const crp, U32 const forCCtx)
 {
     size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog);
     size_t const hSize = ((size_t)1) << cParams->hashLog;
@@ -912,6 +1088,9 @@ static void* ZSTD_reset_matchState(ZSTD_matchState_t* ms, void* ptr, ZSTD_compre
 
     ms->hashLog3 = hashLog3;
     memset(&ms->window, 0, sizeof(ms->window));
+    ms->window.dictLimit = 1;    /* start from 1, so that 1st position is valid */
+    ms->window.lowLimit = 1;     /* ensures first and subsequent uses of this CCtx compress identically */
+    ms->window.nextSrc = ms->window.base + 1;   /* see issue #1241 */
     ZSTD_invalidateMatchState(ms);
 
     /* opt parser space */
@@ -937,14 +1116,24 @@ static void* ZSTD_reset_matchState(ZSTD_matchState_t* ms, void* ptr, ZSTD_compre
     ms->hashTable3 = ms->chainTable + chainSize;
     ptr = ms->hashTable3 + h3Size;
 
+    ms->cParams = *cParams;
+
     assert(((size_t)ptr & 3) == 0);
     return ptr;
 }
 
+#define ZSTD_WORKSPACETOOLARGE_FACTOR 3 /* define "workspace is too large" as this number of times larger than needed */
+#define ZSTD_WORKSPACETOOLARGE_MAXDURATION 128  /* when the workspace remains too large
+                                         * for this many consecutive resets,
+                                         * the context's memory usage is considered wasteful,
+                                         * because it is sized for a worst-case scenario that rarely happens.
+                                         * In that case, resize it down to free some memory */
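+/* Example with the values above: a 4 MB workspace when only 1 MB is needed
+ * counts as "too large" (4 MB > 3 * 1 MB); once that condition persists beyond
+ * 128 consecutive resets, the workspace is considered wasteful and is
+ * reallocated down to the needed size (see ZSTD_resetCCtx_internal() below). */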
+
 /*! ZSTD_resetCCtx_internal() :
     note : `params` are assumed fully validated at this stage */
 static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
-                                      ZSTD_CCtx_params params, U64 pledgedSrcSize,
+                                      ZSTD_CCtx_params params,
+                                      U64 pledgedSrcSize,
                                       ZSTD_compResetPolicy_e const crp,
                                       ZSTD_buffered_policy_e const zbuff)
 {
@@ -954,34 +1143,35 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
 
     if (crp == ZSTDcrp_continue) {
         if (ZSTD_equivalentParams(zc->appliedParams, params,
-                                zc->inBuffSize, zc->blockSize,
-                                zbuff, pledgedSrcSize)) {
-            DEBUGLOG(4, "ZSTD_equivalentParams()==1 -> continue mode (wLog1=%u, blockSize1=%u)",
-                        zc->appliedParams.cParams.windowLog, (U32)zc->blockSize);
-            return ZSTD_continueCCtx(zc, params, pledgedSrcSize);
+                                  zc->inBuffSize,
+                                  zc->seqStore.maxNbSeq, zc->seqStore.maxNbLit,
+                                  zbuff, pledgedSrcSize)) {
+            DEBUGLOG(4, "ZSTD_equivalentParams()==1 -> continue mode (wLog1=%u, blockSize1=%zu)",
+                        zc->appliedParams.cParams.windowLog, zc->blockSize);
+            zc->workSpaceOversizedDuration += (zc->workSpaceOversizedDuration > 0);   /* if it was too large, it still is */
+            if (zc->workSpaceOversizedDuration <= ZSTD_WORKSPACETOOLARGE_MAXDURATION)
+                return ZSTD_continueCCtx(zc, params, pledgedSrcSize);
     }   }
     DEBUGLOG(4, "ZSTD_equivalentParams()==0 -> reset CCtx");
 
     if (params.ldmParams.enableLdm) {
         /* Adjust long distance matching parameters */
-        params.ldmParams.windowLog = params.cParams.windowLog;
        ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
         assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog);
         assert(params.ldmParams.hashEveryLog < 32);
-        zc->ldmState.hashPower =
-                ZSTD_ldm_getHashPower(params.ldmParams.minMatchLength);
+        zc->ldmState.hashPower = ZSTD_ldm_getHashPower(params.ldmParams.minMatchLength);
     }
 
     {   size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize));
         size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
         U32    const divider = (params.cParams.searchLength==3) ? 3 : 4;
         size_t const maxNbSeq = blockSize / divider;
-        size_t const tokenSpace = blockSize + 11*maxNbSeq;
+        size_t const tokenSpace = WILDCOPY_OVERLENGTH + blockSize + 11*maxNbSeq;
         size_t const buffOutSize = (zbuff==ZSTDb_buffered) ? ZSTD_compressBound(blockSize)+1 : 0;
         size_t const buffInSize = (zbuff==ZSTDb_buffered) ? windowSize + blockSize : 0;
        size_t const matchStateSize = ZSTD_sizeof_matchState(&params.cParams, /* forCCtx */ 1);
         size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize);
-        void* ptr;
+        void* ptr;   /* used to partition workSpace */
 
         /* Check if workSpace is large enough, alloc a new one if needed */
         {   size_t const entropySpace = HUF_WORKSPACE_SIZE;
@@ -993,14 +1183,20 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
             size_t const neededSpace = entropySpace + blockStateSpace + ldmSpace +
                                        ldmSeqSpace + matchStateSize + tokenSpace +
                                        bufferSpace;
-            DEBUGLOG(4, "Need %uKB workspace, including %uKB for match state, and %uKB for buffers",
-                        (U32)(neededSpace>>10), (U32)(matchStateSize>>10), (U32)(bufferSpace>>10));
-            DEBUGLOG(4, "windowSize: %u - blockSize: %u", (U32)windowSize, (U32)blockSize);
 
-            if (zc->workSpaceSize < neededSpace) {  /* too small : resize */
-                DEBUGLOG(4, "Need to update workSpaceSize from %uK to %uK",
-                            (unsigned)(zc->workSpaceSize>>10),
-                            (unsigned)(neededSpace>>10));
+            int const workSpaceTooSmall = zc->workSpaceSize < neededSpace;
+            int const workSpaceTooLarge = zc->workSpaceSize > ZSTD_WORKSPACETOOLARGE_FACTOR * neededSpace;
+            int const workSpaceWasteful = workSpaceTooLarge && (zc->workSpaceOversizedDuration > ZSTD_WORKSPACETOOLARGE_MAXDURATION);
+            zc->workSpaceOversizedDuration = workSpaceTooLarge ? zc->workSpaceOversizedDuration+1 : 0;
+
+            DEBUGLOG(4, "Need %zuKB workspace, including %zuKB for match state, and %zuKB for buffers",
+                        neededSpace>>10, matchStateSize>>10, bufferSpace>>10);
+            DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize);
+
+            if (workSpaceTooSmall || workSpaceWasteful) {
+                DEBUGLOG(4, "Need to resize workSpaceSize from %zuKB to %zuKB",
+                            zc->workSpaceSize >> 10,
+                            neededSpace >> 10);
                 /* static cctx : no resize, error out */
                 if (zc->staticSize) return ERROR(memory_allocation);
 
@@ -1009,9 +1205,11 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
                 zc->workSpace = ZSTD_malloc(neededSpace, zc->customMem);
                 if (zc->workSpace == NULL) return ERROR(memory_allocation);
                 zc->workSpaceSize = neededSpace;
-                ptr = zc->workSpace;
+                zc->workSpaceOversizedDuration = 0;
 
-                /* Statically sized space. entropyWorkspace never moves (but prev/next block swap places) */
+                /* Statically sized space.
+                 * entropyWorkspace never moves,
+                 * though prev/next block swap places */
                 assert(((size_t)zc->workSpace & 3) == 0);   /* ensure correct alignment */
                 assert(zc->workSpaceSize >= 2 * sizeof(ZSTD_compressedBlockState_t));
                 zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)zc->workSpace;
@@ -1022,6 +1220,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
 
         /* init params */
         zc->appliedParams = params;
+        zc->blockState.matchState.cParams = params.cParams;
         zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
         zc->consumedSrcSize = 0;
         zc->producedCSize = 0;
@@ -1058,13 +1257,18 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
        ptr = ZSTD_reset_matchState(&zc->blockState.matchState, ptr, &params.cParams, crp, /* forCCtx */ 1);
 
         /* sequences storage */
+        zc->seqStore.maxNbSeq = maxNbSeq;
         zc->seqStore.sequencesStart = (seqDef*)ptr;
         ptr = zc->seqStore.sequencesStart + maxNbSeq;
         zc->seqStore.llCode = (BYTE*) ptr;
         zc->seqStore.mlCode = zc->seqStore.llCode + maxNbSeq;
         zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq;
         zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq;
-        ptr = zc->seqStore.litStart + blockSize;
+        /* ZSTD_wildcopy() is used to copy into the literals buffer,
+         * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
+         */
+        zc->seqStore.maxNbLit = blockSize;
+        ptr = zc->seqStore.litStart + blockSize + WILDCOPY_OVERLENGTH;
 
         /* ldm bucketOffsets table */
         if (params.ldmParams.enableLdm) {
@@ -1098,28 +1302,110 @@ void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) {
     assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
 }
 
-static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
+/* These are the approximate sizes for each strategy past which copying the
+ * dictionary tables into the working context is faster than using them
+ * in-place.
+ */
+static const size_t attachDictSizeCutoffs[(unsigned)ZSTD_btultra+1] = {
+    8 KB, /* unused */
+    8 KB, /* ZSTD_fast */
+    16 KB, /* ZSTD_dfast */
+    32 KB, /* ZSTD_greedy */
+    32 KB, /* ZSTD_lazy */
+    32 KB, /* ZSTD_lazy2 */
+    32 KB, /* ZSTD_btlazy2 */
+    32 KB, /* ZSTD_btopt */
+    8 KB /* ZSTD_btultra */
+};
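+/* For example, with a cdict built for ZSTD_greedy (32 KB cutoff), a 10 KB
+ * pledged source attaches the cdict in place, while a 1 MB source copies its
+ * tables into the working context (absent ZSTD_dictForceAttach /
+ * ZSTD_dictForceCopy overrides). */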
+
+static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict,
+                                 ZSTD_CCtx_params params,
+                                 U64 pledgedSrcSize)
+{
+    size_t cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy];
+    return ( pledgedSrcSize <= cutoff
+          || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
+          || params.attachDictPref == ZSTD_dictForceAttach )
+        && params.attachDictPref != ZSTD_dictForceCopy
+        && !params.forceWindow; /* dictMatchState isn't correctly
+                                 * handled in _enforceMaxDist */
+}
+
+static size_t ZSTD_resetCCtx_byAttachingCDict(
+    ZSTD_CCtx* cctx,
+    const ZSTD_CDict* cdict,
+    ZSTD_CCtx_params params,
+    U64 pledgedSrcSize,
+    ZSTD_buffered_policy_e zbuff)
+{
+    {
+        const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams;
+        unsigned const windowLog = params.cParams.windowLog;
+        assert(windowLog != 0);
+        /* Resize working context table params for input only, since the dict
+         * has its own tables. */
+        params.cParams = ZSTD_adjustCParams_internal(*cdict_cParams, pledgedSrcSize, 0);
+        params.cParams.windowLog = windowLog;
+        ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
+                                ZSTDcrp_continue, zbuff);
+        assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy);
+    }
+
+    {
+        const U32 cdictEnd = (U32)( cdict->matchState.window.nextSrc
+                                  - cdict->matchState.window.base);
+        const U32 cdictLen = cdictEnd - cdict->matchState.window.dictLimit;
+        if (cdictLen == 0) {
+            /* don't even attach dictionaries with no contents */
+            DEBUGLOG(4, "skipping attaching empty dictionary");
+        } else {
+            DEBUGLOG(4, "attaching dictionary into context");
+            cctx->blockState.matchState.dictMatchState = &cdict->matchState;
+
+            /* prep working match state so dict matches never have negative indices
+             * when they are translated to the working context's index space. */
+            if (cctx->blockState.matchState.window.dictLimit < cdictEnd) {
+                cctx->blockState.matchState.window.nextSrc =
+                    cctx->blockState.matchState.window.base + cdictEnd;
+                ZSTD_window_clear(&cctx->blockState.matchState.window);
+            }
+            cctx->blockState.matchState.loadedDictEnd = cctx->blockState.matchState.window.dictLimit;
+        }
+    }
+
+    cctx->dictID = cdict->dictID;
+
+    /* copy block state */
+    memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState));
+
+    return 0;
+}
+
+static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
                             const ZSTD_CDict* cdict,
-                            unsigned windowLog,
-                            ZSTD_frameParameters fParams,
+                            ZSTD_CCtx_params params,
                             U64 pledgedSrcSize,
                             ZSTD_buffered_policy_e zbuff)
 {
-    {   ZSTD_CCtx_params params = cctx->requestedParams;
+    const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams;
+
+    DEBUGLOG(4, "copying dictionary into context");
+
+    {   unsigned const windowLog = params.cParams.windowLog;
+        assert(windowLog != 0);
         /* Copy only compression parameters related to tables. */
-        params.cParams = cdict->cParams;
-        if (windowLog) params.cParams.windowLog = windowLog;
-        params.fParams = fParams;
+        params.cParams = *cdict_cParams;
+        params.cParams.windowLog = windowLog;
         ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
                                 ZSTDcrp_noMemset, zbuff);
-        assert(cctx->appliedParams.cParams.strategy == cdict->cParams.strategy);
-        assert(cctx->appliedParams.cParams.hashLog == cdict->cParams.hashLog);
-        assert(cctx->appliedParams.cParams.chainLog == cdict->cParams.chainLog);
+        assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy);
+        assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog);
+        assert(cctx->appliedParams.cParams.chainLog == cdict_cParams->chainLog);
     }
 
     /* copy tables */
-    {   size_t const chainSize = (cdict->cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict->cParams.chainLog);
-        size_t const hSize =  (size_t)1 << cdict->cParams.hashLog;
+    {   size_t const chainSize = (cdict_cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict_cParams->chainLog);
+        size_t const hSize =  (size_t)1 << cdict_cParams->hashLog;
         size_t const tableSpace = (chainSize + hSize) * sizeof(U32);
         assert((U32*)cctx->blockState.matchState.chainTable == (U32*)cctx->blockState.matchState.hashTable + hSize);  /* chainTable must follow hashTable */
         assert((U32*)cctx->blockState.matchState.hashTable3 == (U32*)cctx->blockState.matchState.chainTable + chainSize);
@@ -1127,6 +1413,7 @@ static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
         assert((U32*)cdict->matchState.hashTable3 == (U32*)cdict->matchState.chainTable + chainSize);
         memcpy(cctx->blockState.matchState.hashTable, cdict->matchState.hashTable, tableSpace);   /* presumes all tables follow each other */
     }
+
     /* Zero the hashTable3, since the cdict never fills it */
     {   size_t const h3Size = (size_t)1 << cctx->blockState.matchState.hashLog3;
         assert(cdict->matchState.hashLog3 == 0);
@@ -1134,14 +1421,14 @@ static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
     }
 
     /* copy dictionary offsets */
-    {
-        ZSTD_matchState_t const* srcMatchState = &cdict->matchState;
+    {   ZSTD_matchState_t const* srcMatchState = &cdict->matchState;
         ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState;
         dstMatchState->window       = srcMatchState->window;
         dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
         dstMatchState->nextToUpdate3= srcMatchState->nextToUpdate3;
         dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd;
     }
+
     cctx->dictID = cdict->dictID;
 
     /* copy block state */
@@ -1150,6 +1437,27 @@ static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
     return 0;
 }
 
+/* We have a choice between copying the dictionary context into the working
+ * context, or referencing the dictionary context from the working context
+ * in-place. We decide here which strategy to use. */
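+/* Attaching uses the cdict's tables in place and skips the upfront table copy;
+ * copying pays that cost once but is faster per searched position, so it wins
+ * for inputs larger than the attachDictSizeCutoffs thresholds above. */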
+static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
+                            const ZSTD_CDict* cdict,
+                            ZSTD_CCtx_params params,
+                            U64 pledgedSrcSize,
+                            ZSTD_buffered_policy_e zbuff)
+{
+
+    DEBUGLOG(4, "ZSTD_resetCCtx_usingCDict (pledgedSrcSize=%u)", (U32)pledgedSrcSize);
+
+    if (ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) {
+        return ZSTD_resetCCtx_byAttachingCDict(
+            cctx, cdict, params, pledgedSrcSize, zbuff);
+    } else {
+        return ZSTD_resetCCtx_byCopyingCDict(
+            cctx, cdict, params, pledgedSrcSize, zbuff);
+    }
+}
+
 /*! ZSTD_copyCCtx_internal() :
  *  Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
  *  Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
@@ -1192,7 +1500,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
 
     /* copy dictionary offsets */
     {
-        ZSTD_matchState_t const* srcMatchState = &srcCCtx->blockState.matchState;
+        const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState;
         ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState;
         dstMatchState->window       = srcMatchState->window;
         dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
@@ -1294,15 +1602,15 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
 
 /* See doc/zstd_compression_format.md for detailed format description */
 
-size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+static size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
 {
+    U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3);
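+    /* 3-byte block header, little-endian: bit 0 = lastBlock flag,
+     * bits 1-2 = block type (bt_raw == 0 here), bits 3-23 = block size. */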
     if (srcSize + ZSTD_blockHeaderSize > dstCapacity) return ERROR(dstSize_tooSmall);
+    MEM_writeLE24(dst, cBlockHeader24);
     memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize);
-    MEM_writeLE24(dst, (U32)(srcSize << 2) + (U32)bt_raw);
-    return ZSTD_blockHeaderSize+srcSize;
+    return ZSTD_blockHeaderSize + srcSize;
 }
 
-
 static size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     BYTE* const ostart = (BYTE* const)dst;
@@ -1356,16 +1664,24 @@ static size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, cons
 }
 
 
-static size_t ZSTD_minGain(size_t srcSize) { return (srcSize >> 6) + 2; }
+/* ZSTD_minGain() :
+ * minimum compression required
+ * to generate a compressed block or a compressed literals section.
+ * note : use same formula for both situations */
+static size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
+{
+    U32 const minlog = (strat==ZSTD_btultra) ? 7 : 6;
+    return (srcSize >> minlog) + 2;
+}
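+/* For example, a 4 KB input must shrink by at least (4096>>6)+2 = 66 bytes
+ * ((4096>>7)+2 = 34 bytes with ZSTD_btultra) for the compressed form to be
+ * preferred over the raw form. */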
 
-static size_t ZSTD_compressLiterals (ZSTD_entropyCTables_t const* prevEntropy,
-                                     ZSTD_entropyCTables_t* nextEntropy,
+static size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
+                                     ZSTD_hufCTables_t* nextHuf,
                                      ZSTD_strategy strategy, int disableLiteralCompression,
                                      void* dst, size_t dstCapacity,
                                const void* src, size_t srcSize,
                                      U32* workspace, const int bmi2)
 {
-    size_t const minGain = ZSTD_minGain(srcSize);
+    size_t const minGain = ZSTD_minGain(srcSize, strategy);
     size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
     BYTE*  const ostart = (BYTE*)dst;
     U32 singleStream = srcSize < 256;
@@ -1376,27 +1692,25 @@ static size_t ZSTD_compressLiterals (ZSTD_entropyCTables_t const* prevEntropy,
                 disableLiteralCompression);
 
     /* Prepare nextEntropy assuming reusing the existing table */
-    nextEntropy->hufCTable_repeatMode = prevEntropy->hufCTable_repeatMode;
-    memcpy(nextEntropy->hufCTable, prevEntropy->hufCTable,
-           sizeof(prevEntropy->hufCTable));
+    memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
 
     if (disableLiteralCompression)
         return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
 
     /* small ? don't even attempt compression (speed opt) */
 #   define COMPRESS_LITERALS_SIZE_MIN 63
-    {   size_t const minLitSize = (prevEntropy->hufCTable_repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
+    {   size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
         if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
     }
 
     if (dstCapacity < lhSize+1) return ERROR(dstSize_tooSmall);   /* not enough space for compression */
-    {   HUF_repeat repeat = prevEntropy->hufCTable_repeatMode;
+    {   HUF_repeat repeat = prevHuf->repeatMode;
         int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0;
         if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
         cLitSize = singleStream ? HUF_compress1X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11,
-                                      workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextEntropy->hufCTable, &repeat, preferRepeat, bmi2)
+                                      workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2)
                                 : HUF_compress4X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11,
-                                      workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextEntropy->hufCTable, &repeat, preferRepeat, bmi2);
+                                      workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2);
         if (repeat != HUF_repeat_none) {
             /* reused the existing table */
             hType = set_repeat;
@@ -1404,17 +1718,17 @@ static size_t ZSTD_compressLiterals (ZSTD_entropyCTables_t const* prevEntropy,
     }
 
     if ((cLitSize==0) | (cLitSize >= srcSize - minGain) | ERR_isError(cLitSize)) {
-        memcpy(nextEntropy->hufCTable, prevEntropy->hufCTable, sizeof(prevEntropy->hufCTable));
+        memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
         return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
     }
     if (cLitSize==1) {
-        memcpy(nextEntropy->hufCTable, prevEntropy->hufCTable, sizeof(prevEntropy->hufCTable));
+        memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
         return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
     }
 
     if (hType == set_compressed) {
         /* using a newly constructed table */
-        nextEntropy->hufCTable_repeatMode = HUF_repeat_check;
+        nextHuf->repeatMode = HUF_repeat_check;
     }
 
     /* Build header */
@@ -1451,6 +1765,7 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
     BYTE* const mlCodeTable = seqStorePtr->mlCode;
     U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
     U32 u;
+    assert(nbSeq <= seqStorePtr->maxNbSeq);
    for (u=0; u<nbSeq; u++) {
        U32 const llv = sequences[u].litLength;
        U32 const mlv = sequences[u].matchLength;
        llCodeTable[u] = (BYTE)ZSTD_LLcode(llv);
        ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset);
        mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv);
    }
    if (seqStorePtr->longLengthID==1)
        llCodeTable[seqStorePtr->longLengthPos] = MaxLL;
    if (seqStorePtr->longLengthID==2)
        mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
 }
 
+
+/**
+ * -log2(x / 256) lookup table for x in [0, 256).
+ * If x == 0: Return 0
+ * Else: Return floor(-log2(x / 256) * 256)
+ */
+static unsigned const kInverseProbabiltyLog256[256] = {
+    0,    2048, 1792, 1642, 1536, 1453, 1386, 1329, 1280, 1236, 1197, 1162,
+    1130, 1100, 1073, 1047, 1024, 1001, 980,  960,  941,  923,  906,  889,
+    874,  859,  844,  830,  817,  804,  791,  779,  768,  756,  745,  734,
+    724,  714,  704,  694,  685,  676,  667,  658,  650,  642,  633,  626,
+    618,  610,  603,  595,  588,  581,  574,  567,  561,  554,  548,  542,
+    535,  529,  523,  517,  512,  506,  500,  495,  489,  484,  478,  473,
+    468,  463,  458,  453,  448,  443,  438,  434,  429,  424,  420,  415,
+    411,  407,  402,  398,  394,  390,  386,  382,  377,  373,  370,  366,
+    362,  358,  354,  350,  347,  343,  339,  336,  332,  329,  325,  322,
+    318,  315,  311,  308,  305,  302,  298,  295,  292,  289,  286,  282,
+    279,  276,  273,  270,  267,  264,  261,  258,  256,  253,  250,  247,
+    244,  241,  239,  236,  233,  230,  228,  225,  222,  220,  217,  215,
+    212,  209,  207,  204,  202,  199,  197,  194,  192,  190,  187,  185,
+    182,  180,  178,  175,  173,  171,  168,  166,  164,  162,  159,  157,
+    155,  153,  151,  149,  146,  144,  142,  140,  138,  136,  134,  132,
+    130,  128,  126,  123,  121,  119,  117,  115,  114,  112,  110,  108,
+    106,  104,  102,  100,  98,   96,   94,   93,   91,   89,   87,   85,
+    83,   82,   80,   78,   76,   74,   73,   71,   69,   67,   66,   64,
+    62,   61,   59,   57,   55,   54,   52,   50,   49,   47,   46,   44,
+    42,   41,   39,   37,   36,   34,   33,   31,   30,   28,   26,   25,
+    23,   22,   20,   19,   17,   16,   14,   13,   11,   10,   8,    7,
+    5,    4,    2,    1,
+};
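+/* For example, kInverseProbabiltyLog256[128] == 256 and [64] == 512, i.e. 1 bit
+ * and 2 bits respectively in 8.8 fixed-point (the cost estimates below shift
+ * the accumulated total right by 8). */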
+
+
+/**
+ * Returns the cost in bits of encoding the distribution described by count
+ * using the entropy bound.
+ */
+static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t const total)
+{
+    unsigned cost = 0;
+    unsigned s;
+    for (s = 0; s <= max; ++s) {
+        unsigned norm = (unsigned)((256 * count[s]) / total);
+        if (count[s] != 0 && norm == 0)
+            norm = 1;
+        assert(count[s] < total);
+        cost += count[s] * kInverseProbabiltyLog256[norm];
+    }
+    return cost >> 8;
+}
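+/* i.e. an 8.8 fixed-point approximation of sum_s count[s] * -log2(count[s]/total),
+ * the Shannon lower bound (in bits) for coding this histogram. */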
+
+
+/**
+ * Returns the cost in bits of encoding the distribution in count using the
+ * table described by norm. The max symbol supported by norm is assumed >= max.
+ * norm must be valid for every symbol with non-zero probability in count.
+ */
+static size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog,
+                                    unsigned const* count, unsigned const max)
+{
+    unsigned const shift = 8 - accuracyLog;
+    size_t cost = 0;
+    unsigned s;
+    assert(accuracyLog <= 8);
+    for (s = 0; s <= max; ++s) {
+        unsigned const normAcc = norm[s] != -1 ? norm[s] : 1;
+        unsigned const norm256 = normAcc << shift;
+        assert(norm256 > 0);
+        assert(norm256 < 256);
+        cost += count[s] * kInverseProbabiltyLog256[norm256];
+    }
+    return cost >> 8;
+}
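+/* Same accumulation as ZSTD_entropyCost(), but using the probabilities implied
+ * by the candidate table `norm` rather than the empirical ones; a "low
+ * probability" symbol (norm[s] == -1) is charged as if it had probability
+ * 1/(1<<accuracyLog). */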
+
+
+static unsigned ZSTD_getFSEMaxSymbolValue(FSE_CTable const* ctable) {
+  void const* ptr = ctable;
+  U16 const* u16ptr = (U16 const*)ptr;
+  U32 const maxSymbolValue = MEM_read16(u16ptr + 1);
+  return maxSymbolValue;
+}
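+/* Relies on the FSE_CTable memory layout, which begins with a two-U16 header:
+ * tableLog followed by maxSymbolValue (see FSE_buildCTable()). */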
+
+
+/**
+ * Returns the cost in bits of encoding the distribution in count using ctable.
+ * Returns an error if ctable cannot represent all the symbols in count.
+ */
+static size_t ZSTD_fseBitCost(
+    FSE_CTable const* ctable,
+    unsigned const* count,
+    unsigned const max)
+{
+    unsigned const kAccuracyLog = 8;
+    size_t cost = 0;
+    unsigned s;
+    FSE_CState_t cstate;
+    FSE_initCState(&cstate, ctable);
+    if (ZSTD_getFSEMaxSymbolValue(ctable) < max) {
+        DEBUGLOG(5, "Repeat FSE_CTable has maxSymbolValue %u < %u",
+                    ZSTD_getFSEMaxSymbolValue(ctable), max);
+        return ERROR(GENERIC);
+    }
+    for (s = 0; s <= max; ++s) {
+        unsigned const tableLog = cstate.stateLog;
+        unsigned const badCost = (tableLog + 1) << kAccuracyLog;
+        unsigned const bitCost = FSE_bitCost(cstate.symbolTT, tableLog, s, kAccuracyLog);
+        if (count[s] == 0)
+            continue;
+        if (bitCost >= badCost) {
+            DEBUGLOG(5, "Repeat FSE_CTable has Prob[%u] == 0", s);
+            return ERROR(GENERIC);
+        }
+        cost += count[s] * bitCost;
+    }
+    return cost >> kAccuracyLog;
+}
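+/* FSE_bitCost() returns costs scaled by 2^kAccuracyLog; a per-symbol cost of
+ * (tableLog+1) bits or more is treated as "this symbol has probability 0 in
+ * ctable", in which case the previous table cannot encode the block. */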
+
+/**
+ * Returns the cost in bytes of encoding the normalized count header.
+ * Returns an error if any of the helper functions return an error.
+ */
+static size_t ZSTD_NCountCost(unsigned const* count, unsigned const max,
+                              size_t const nbSeq, unsigned const FSELog)
+{
+    BYTE wksp[FSE_NCOUNTBOUND];
+    S16 norm[MaxSeq + 1];
+    const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
+    CHECK_F(FSE_normalizeCount(norm, tableLog, count, nbSeq, max));
+    return FSE_writeNCount(wksp, sizeof(wksp), norm, max, tableLog);
+}
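+/* Note: this is a byte count; ZSTD_selectEncodingType() converts it to bits
+ * (<< 3) before adding the ZSTD_entropyCost() estimate of the payload. */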
+
+
 typedef enum {
     ZSTD_defaultDisallowed = 0,
     ZSTD_defaultAllowed = 1
 } ZSTD_defaultPolicy_e;
 
-MEM_STATIC
-symbolEncodingType_e ZSTD_selectEncodingType(
-        FSE_repeat* repeatMode, size_t const mostFrequent, size_t nbSeq,
-        U32 defaultNormLog, ZSTD_defaultPolicy_e const isDefaultAllowed)
+MEM_STATIC symbolEncodingType_e
+ZSTD_selectEncodingType(
+        FSE_repeat* repeatMode, unsigned const* count, unsigned const max,
+        size_t const mostFrequent, size_t nbSeq, unsigned const FSELog,
+        FSE_CTable const* prevCTable,
+        short const* defaultNorm, U32 defaultNormLog,
+        ZSTD_defaultPolicy_e const isDefaultAllowed,
+        ZSTD_strategy const strategy)
 {
-#define MIN_SEQ_FOR_DYNAMIC_FSE   64
-#define MAX_SEQ_FOR_STATIC_FSE  1000
     ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0);
-    if ((mostFrequent == nbSeq) && (!isDefaultAllowed || nbSeq > 2)) {
+    if (mostFrequent == nbSeq) {
+        *repeatMode = FSE_repeat_none;
+        if (isDefaultAllowed && nbSeq <= 2) {
+            /* Prefer set_basic over set_rle when there are 2 or fewer symbols,
+             * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
+             * If basic encoding isn't possible, always choose RLE.
+             */
+            DEBUGLOG(5, "Selected set_basic");
+            return set_basic;
+        }
         DEBUGLOG(5, "Selected set_rle");
-        /* Prefer set_basic over set_rle when there are 2 or less symbols,
-         * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
-         * If basic encoding isn't possible, always choose RLE.
-         */
-        *repeatMode = FSE_repeat_check;
         return set_rle;
     }
-    if ( isDefaultAllowed
-      && (*repeatMode == FSE_repeat_valid) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
-        DEBUGLOG(5, "Selected set_repeat");
-        return set_repeat;
-    }
-    if ( isDefaultAllowed
-      && ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (defaultNormLog-1)))) ) {
-        DEBUGLOG(5, "Selected set_basic");
-        /* The format allows default tables to be repeated, but it isn't useful.
-         * When using simple heuristics to select encoding type, we don't want
-         * to confuse these tables with dictionaries. When running more careful
-         * analysis, we don't need to waste time checking both repeating tables
-         * and default tables.
-         */
-        *repeatMode = FSE_repeat_none;
-        return set_basic;
+    if (strategy < ZSTD_lazy) {
+        if (isDefaultAllowed) {
+            size_t const staticFse_nbSeq_max = 1000;
+            size_t const mult = 10 - strategy;
+            size_t const baseLog = 3;
+            size_t const dynamicFse_nbSeq_min = (((size_t)1 << defaultNormLog) * mult) >> baseLog;  /* 28-36 for offset, 56-72 for lengths */
+            assert(defaultNormLog >= 5 && defaultNormLog <= 6);  /* xx_DEFAULTNORMLOG */
+            assert(mult <= 9 && mult >= 7);
+            if ( (*repeatMode == FSE_repeat_valid)
+              && (nbSeq < staticFse_nbSeq_max) ) {
+                DEBUGLOG(5, "Selected set_repeat");
+                return set_repeat;
+            }
+            if ( (nbSeq < dynamicFse_nbSeq_min)
+              || (mostFrequent < (nbSeq >> (defaultNormLog-1))) ) {
+                DEBUGLOG(5, "Selected set_basic");
+                /* The format allows default tables to be repeated, but it isn't useful.
+                 * When using simple heuristics to select encoding type, we don't want
+                 * to confuse these tables with dictionaries. When running more careful
+                 * analysis, we don't need to waste time checking both repeating tables
+                 * and default tables.
+                 */
+                *repeatMode = FSE_repeat_none;
+                return set_basic;
+            }
+        }
+    } else {
+        size_t const basicCost = isDefaultAllowed ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, count, max) : ERROR(GENERIC);
+        size_t const repeatCost = *repeatMode != FSE_repeat_none ? ZSTD_fseBitCost(prevCTable, count, max) : ERROR(GENERIC);
+        size_t const NCountCost = ZSTD_NCountCost(count, max, nbSeq, FSELog);
+        size_t const compressedCost = (NCountCost << 3) + ZSTD_entropyCost(count, max, nbSeq);
+
+        if (isDefaultAllowed) {
+            assert(!ZSTD_isError(basicCost));
+            assert(!(*repeatMode == FSE_repeat_valid && ZSTD_isError(repeatCost)));
+        }
+        assert(!ZSTD_isError(NCountCost));
+        assert(compressedCost < ERROR(maxCode));
+        DEBUGLOG(5, "Estimated bit costs: basic=%u\trepeat=%u\tcompressed=%u",
+                    (U32)basicCost, (U32)repeatCost, (U32)compressedCost);
+        if (basicCost <= repeatCost && basicCost <= compressedCost) {
+            DEBUGLOG(5, "Selected set_basic");
+            assert(isDefaultAllowed);
+            *repeatMode = FSE_repeat_none;
+            return set_basic;
+        }
+        if (repeatCost <= compressedCost) {
+            DEBUGLOG(5, "Selected set_repeat");
+            assert(!ZSTD_isError(repeatCost));
+            return set_repeat;
+        }
+        assert(compressedCost < basicCost && compressedCost < repeatCost);
     }
     DEBUGLOG(5, "Selected set_compressed");
     *repeatMode = FSE_repeat_check;
     return set_compressed;
 }
 
-MEM_STATIC
-size_t ZSTD_buildCTable(void* dst, size_t dstCapacity,
-        FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
-        U32* count, U32 max,
-        BYTE const* codeTable, size_t nbSeq,
-        S16 const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
-        FSE_CTable const* prevCTable, size_t prevCTableSize,
-        void* workspace, size_t workspaceSize)
+MEM_STATIC size_t
+ZSTD_buildCTable(void* dst, size_t dstCapacity,
+                FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
+                U32* count, U32 max,
+                const BYTE* codeTable, size_t nbSeq,
+                const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+                const FSE_CTable* prevCTable, size_t prevCTableSize,
+                void* workspace, size_t workspaceSize)
 {
     BYTE* op = (BYTE*)dst;
-    BYTE const* const oend = op + dstCapacity;
+    const BYTE* const oend = op + dstCapacity;
 
     switch (type) {
     case set_rle:
@@ -1674,7 +2162,7 @@ ZSTD_encodeSequences_bmi2(
 
 #endif
 
-size_t ZSTD_encodeSequences(
+static size_t ZSTD_encodeSequences(
             void* dst, size_t dstCapacity,
             FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
             FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
@@ -1706,10 +2194,11 @@ MEM_STATIC size_t ZSTD_compressSequences_internal(seqStore_t* seqStorePtr,
                               const int bmi2)
 {
     const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+    ZSTD_strategy const strategy = cctxParams->cParams.strategy;
     U32 count[MaxSeq+1];
-    FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable;
-    FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable;
-    FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable;
+    FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable;
+    FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable;
+    FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable;
     U32 LLtype, Offtype, MLtype;   /* compressed, raw or rle */
     const seqDef* const sequences = seqStorePtr->sequencesStart;
     const BYTE* const ofCodeTable = seqStorePtr->ofCode;
@@ -1720,15 +2209,17 @@ MEM_STATIC size_t ZSTD_compressSequences_internal(seqStore_t* seqStorePtr,
     BYTE* op = ostart;
     size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
     BYTE* seqHead;
+    BYTE* lastNCount = NULL;
 
    ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
 
    /* Compress literals */
    {   const BYTE* const literals = seqStorePtr->litStart;
         size_t const litSize = seqStorePtr->lit - literals;
+        int const disableLiteralCompression = (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0);
         size_t const cSize = ZSTD_compressLiterals(
-                                    prevEntropy, nextEntropy,
-                                    cctxParams->cParams.strategy, cctxParams->disableLiteralCompression,
+                                    &prevEntropy->huf, &nextEntropy->huf,
+                                    cctxParams->cParams.strategy, disableLiteralCompression,
                                     op, dstCapacity,
                                     literals, litSize,
                                     workspace, bmi2);
@@ -1747,13 +2238,9 @@ MEM_STATIC size_t ZSTD_compressSequences_internal(seqStore_t* seqStorePtr,
     else
         op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
     if (nbSeq==0) {
-      memcpy(nextEntropy->litlengthCTable, prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable));
-      nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode;
-      memcpy(nextEntropy->offcodeCTable, prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable));
-      nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode;
-      memcpy(nextEntropy->matchlengthCTable, prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable));
-      nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode;
-      return op - ostart;
+        /* Copy the old tables over as if we repeated them */
+        memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse));
+        return op - ostart;
     }
 
     /* seqHead : flags for FSE encoding type */
@@ -1763,43 +2250,53 @@ MEM_STATIC size_t ZSTD_compressSequences_internal(seqStore_t* seqStorePtr,
     ZSTD_seqToCodes(seqStorePtr);
     /* build CTable for Literal Lengths */
     {   U32 max = MaxLL;
-        size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, workspace);
+        size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, workspace);   /* can't fail */
         DEBUGLOG(5, "Building LL table");
-        nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode;
-        LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, mostFrequent, nbSeq, LL_defaultNormLog, ZSTD_defaultAllowed);
+        nextEntropy->fse.litlength_repeatMode = prevEntropy->fse.litlength_repeatMode;
+        LLtype = ZSTD_selectEncodingType(&nextEntropy->fse.litlength_repeatMode, count, max, mostFrequent, nbSeq, LLFSELog, prevEntropy->fse.litlengthCTable, LL_defaultNorm, LL_defaultNormLog, ZSTD_defaultAllowed, strategy);
+        assert(set_basic < set_compressed && set_rle < set_compressed);
+        assert(!(LLtype < set_compressed && nextEntropy->fse.litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
         {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype,
-                    count, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL,
-                    prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable),
-                    workspace, HUF_WORKSPACE_SIZE);
+                                                    count, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL,
+                                                    prevEntropy->fse.litlengthCTable, sizeof(prevEntropy->fse.litlengthCTable),
+                                                    workspace, HUF_WORKSPACE_SIZE);
             if (ZSTD_isError(countSize)) return countSize;
+            if (LLtype == set_compressed)
+                lastNCount = op;
             op += countSize;
     }   }
     /* build CTable for Offsets */
     {   U32 max = MaxOff;
-        size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, workspace);
+        size_t const mostFrequent = HIST_countFast_wksp(count, &max, ofCodeTable, nbSeq, workspace);  /* can't fail */
         /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */
         ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed;
         DEBUGLOG(5, "Building OF table");
-        nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode;
-        Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, mostFrequent, nbSeq, OF_defaultNormLog, defaultPolicy);
+        nextEntropy->fse.offcode_repeatMode = prevEntropy->fse.offcode_repeatMode;
+        Offtype = ZSTD_selectEncodingType(&nextEntropy->fse.offcode_repeatMode, count, max, mostFrequent, nbSeq, OffFSELog, prevEntropy->fse.offcodeCTable, OF_defaultNorm, OF_defaultNormLog, defaultPolicy, strategy);
+        assert(!(Offtype < set_compressed && nextEntropy->fse.offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */
         {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype,
-                    count, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
-                    prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable),
-                    workspace, HUF_WORKSPACE_SIZE);
+                                                    count, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+                                                    prevEntropy->fse.offcodeCTable, sizeof(prevEntropy->fse.offcodeCTable),
+                                                    workspace, HUF_WORKSPACE_SIZE);
             if (ZSTD_isError(countSize)) return countSize;
+            if (Offtype == set_compressed)
+                lastNCount = op;
             op += countSize;
     }   }
     /* build CTable for MatchLengths */
     {   U32 max = MaxML;
-        size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, workspace);
+        size_t const mostFrequent = HIST_countFast_wksp(count, &max, mlCodeTable, nbSeq, workspace);   /* can't fail */
         DEBUGLOG(5, "Building ML table");
-        nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode;
-        MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, mostFrequent, nbSeq, ML_defaultNormLog, ZSTD_defaultAllowed);
+        nextEntropy->fse.matchlength_repeatMode = prevEntropy->fse.matchlength_repeatMode;
+        MLtype = ZSTD_selectEncodingType(&nextEntropy->fse.matchlength_repeatMode, count, max, mostFrequent, nbSeq, MLFSELog, prevEntropy->fse.matchlengthCTable, ML_defaultNorm, ML_defaultNormLog, ZSTD_defaultAllowed, strategy);
+        assert(!(MLtype < set_compressed && nextEntropy->fse.matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
         {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype,
-                    count, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML,
-                    prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable),
-                    workspace, HUF_WORKSPACE_SIZE);
+                                                    count, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML,
+                                                    prevEntropy->fse.matchlengthCTable, sizeof(prevEntropy->fse.matchlengthCTable),
+                                                    workspace, HUF_WORKSPACE_SIZE);
             if (ZSTD_isError(countSize)) return countSize;
+            if (MLtype == set_compressed)
+                lastNCount = op;
             op += countSize;
     }   }
 
@@ -1814,21 +2311,37 @@ MEM_STATIC size_t ZSTD_compressSequences_internal(seqStore_t* seqStorePtr,
                                         longOffsets, bmi2);
         if (ZSTD_isError(bitstreamSize)) return bitstreamSize;
         op += bitstreamSize;
+        /* zstd versions <= 1.3.4 mistakenly report corruption when
+         * FSE_readNCount() receives a buffer < 4 bytes.
+         * Fixed by https://github.com/facebook/zstd/pull/1146.
+         * This can happen when the last set_compressed table present is 2
+         * bytes and the bitstream is only one byte.
+         * In this exceedingly rare case, we will simply emit an uncompressed
+         * block, since it isn't worth optimizing.
+         */
+        if (lastNCount && (op - lastNCount) < 4) {
+            /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */
+            assert(op - lastNCount == 3);
+            DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "
+                        "emitting an uncompressed block.");
+            return 0;
+        }
     }
 
     return op - ostart;
 }
 
 MEM_STATIC size_t ZSTD_compressSequences(seqStore_t* seqStorePtr,
-                              ZSTD_entropyCTables_t const* prevEntropy,
+                        const ZSTD_entropyCTables_t* prevEntropy,
                               ZSTD_entropyCTables_t* nextEntropy,
-                              ZSTD_CCtx_params const* cctxParams,
+                        const ZSTD_CCtx_params* cctxParams,
                               void* dst, size_t dstCapacity,
                               size_t srcSize, U32* workspace, int bmi2)
 {
     size_t const cSize = ZSTD_compressSequences_internal(
             seqStorePtr, prevEntropy, nextEntropy, cctxParams, dst, dstCapacity,
             workspace, bmi2);
+    if (cSize == 0) return 0;
     /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block.
      * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block.
      */
@@ -1837,40 +2350,55 @@ MEM_STATIC size_t ZSTD_compressSequences(seqStore_t* seqStorePtr,
     if (ZSTD_isError(cSize)) return cSize;
 
     /* Check compressibility */
-    {   size_t const maxCSize = srcSize - ZSTD_minGain(srcSize);  /* note : fixed formula, maybe should depend on compression level, or strategy */
+    {   size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);
         if (cSize >= maxCSize) return 0;  /* block not compressed */
     }
 
-    /* We check that dictionaries have offset codes available for the first
-     * block. After the first block, the offcode table might not have large
-     * enough codes to represent the offsets in the data.
-     */
-    if (nextEntropy->offcode_repeatMode == FSE_repeat_valid)
-        nextEntropy->offcode_repeatMode = FSE_repeat_check;
-
     return cSize;
 }
 
 /* ZSTD_selectBlockCompressor() :
  * Not static, but internal use only (used by long distance matcher)
  * assumption : strat is a valid strategy */
-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int extDict)
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode)
 {
-    static const ZSTD_blockCompressor blockCompressor[2][(unsigned)ZSTD_btultra+1] = {
+    static const ZSTD_blockCompressor blockCompressor[3][(unsigned)ZSTD_btultra+1] = {
         { ZSTD_compressBlock_fast  /* default for 0 */,
-          ZSTD_compressBlock_fast, ZSTD_compressBlock_doubleFast, ZSTD_compressBlock_greedy,
-          ZSTD_compressBlock_lazy, ZSTD_compressBlock_lazy2, ZSTD_compressBlock_btlazy2,
-          ZSTD_compressBlock_btopt, ZSTD_compressBlock_btultra },
+          ZSTD_compressBlock_fast,
+          ZSTD_compressBlock_doubleFast,
+          ZSTD_compressBlock_greedy,
+          ZSTD_compressBlock_lazy,
+          ZSTD_compressBlock_lazy2,
+          ZSTD_compressBlock_btlazy2,
+          ZSTD_compressBlock_btopt,
+          ZSTD_compressBlock_btultra },
         { ZSTD_compressBlock_fast_extDict  /* default for 0 */,
-          ZSTD_compressBlock_fast_extDict, ZSTD_compressBlock_doubleFast_extDict, ZSTD_compressBlock_greedy_extDict,
-          ZSTD_compressBlock_lazy_extDict,ZSTD_compressBlock_lazy2_extDict, ZSTD_compressBlock_btlazy2_extDict,
-          ZSTD_compressBlock_btopt_extDict, ZSTD_compressBlock_btultra_extDict }
+          ZSTD_compressBlock_fast_extDict,
+          ZSTD_compressBlock_doubleFast_extDict,
+          ZSTD_compressBlock_greedy_extDict,
+          ZSTD_compressBlock_lazy_extDict,
+          ZSTD_compressBlock_lazy2_extDict,
+          ZSTD_compressBlock_btlazy2_extDict,
+          ZSTD_compressBlock_btopt_extDict,
+          ZSTD_compressBlock_btultra_extDict },
+        { ZSTD_compressBlock_fast_dictMatchState  /* default for 0 */,
+          ZSTD_compressBlock_fast_dictMatchState,
+          ZSTD_compressBlock_doubleFast_dictMatchState,
+          ZSTD_compressBlock_greedy_dictMatchState,
+          ZSTD_compressBlock_lazy_dictMatchState,
+          ZSTD_compressBlock_lazy2_dictMatchState,
+          ZSTD_compressBlock_btlazy2_dictMatchState,
+          ZSTD_compressBlock_btopt_dictMatchState,
+          ZSTD_compressBlock_btultra_dictMatchState }
     };
+    ZSTD_blockCompressor selectedCompressor;
     ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1);
 
     assert((U32)strat >= (U32)ZSTD_fast);
     assert((U32)strat <= (U32)ZSTD_btultra);
-    return blockCompressor[extDict!=0][(U32)strat];
+    selectedCompressor = blockCompressor[(int)dictMode][(U32)strat];
+    assert(selectedCompressor != NULL);
+    return selectedCompressor;
 }
 
 static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr,
@@ -1880,7 +2408,7 @@ static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr,
     seqStorePtr->lit += lastLLSize;
 }
 
-static void ZSTD_resetSeqStore(seqStore_t* ssPtr)
+void ZSTD_resetSeqStore(seqStore_t* ssPtr)
 {
     ssPtr->lit = ssPtr->litStart;
     ssPtr->sequences = ssPtr->sequencesStart;
@@ -1892,24 +2420,38 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
                                         const void* src, size_t srcSize)
 {
     ZSTD_matchState_t* const ms = &zc->blockState.matchState;
-    DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
-                (U32)dstCapacity, ms->window.dictLimit, ms->nextToUpdate);
+    size_t cSize;
+    DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%zu, dictLimit=%u, nextToUpdate=%u)",
+                dstCapacity, ms->window.dictLimit, ms->nextToUpdate);
+    assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
+
+    /* Assert that we have correctly flushed the ctx params into the ms's copy */
+    ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams);
+
     if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
         ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.searchLength);
-        return 0;   /* don't even attempt compression below a certain srcSize */
+        cSize = 0;
+        goto out;  /* don't even attempt compression below a certain srcSize */
     }
     ZSTD_resetSeqStore(&(zc->seqStore));
+    ms->opt.symbolCosts = &zc->blockState.prevCBlock->entropy;   /* required for optimal parser to read stats from dictionary */
+
+    /* a gap between an attached dict and the current window is not safe,
+     * they must remain adjacent, and when that stops being the case, the dict
+     * must be unset */
+    assert(ms->dictMatchState == NULL || ms->loadedDictEnd == ms->window.dictLimit);
 
     /* limited update after a very long match */
     {   const BYTE* const base = ms->window.base;
         const BYTE* const istart = (const BYTE*)src;
         const U32 current = (U32)(istart-base);
+        if (sizeof(ptrdiff_t)==8) assert(istart - base < (ptrdiff_t)(U32)(-1));   /* ensure no overflow */
         if (current > ms->nextToUpdate + 384)
             ms->nextToUpdate = current - MIN(192, (U32)(current - ms->nextToUpdate - 384));
     }
 
     /* select and store sequences */
-    {   U32 const extDict = ZSTD_window_hasExtDict(ms->window);
+    {   ZSTD_dictMode_e const dictMode = ZSTD_matchState_dictMode(ms);
         size_t lastLLSize;
         {   int i;
             for (i = 0; i < ZSTD_REP_NUM; ++i)
@@ -1922,8 +2464,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
                 ZSTD_ldm_blockCompress(&zc->externSeqStore,
                                        ms, &zc->seqStore,
                                        zc->blockState.nextCBlock->rep,
-                                       &zc->appliedParams.cParams,
-                                       src, srcSize, extDict);
+                                       src, srcSize);
             assert(zc->externSeqStore.pos <= zc->externSeqStore.size);
         } else if (zc->appliedParams.ldmParams.enableLdm) {
             rawSeqStore_t ldmSeqStore = {NULL, 0, 0, 0};
@@ -1939,31 +2480,38 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
                 ZSTD_ldm_blockCompress(&ldmSeqStore,
                                        ms, &zc->seqStore,
                                        zc->blockState.nextCBlock->rep,
-                                       &zc->appliedParams.cParams,
-                                       src, srcSize, extDict);
+                                       src, srcSize);
             assert(ldmSeqStore.pos == ldmSeqStore.size);
         } else {   /* not long range mode */
-            ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, extDict);
-            lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, &zc->appliedParams.cParams, src, srcSize);
+            ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, dictMode);
+            lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
         }
         {   const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize;
             ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize);
     }   }
 
     /* encode sequences and literals */
-    {   size_t const cSize = ZSTD_compressSequences(&zc->seqStore,
-                                &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
-                                &zc->appliedParams,
-                                dst, dstCapacity,
-                                srcSize, zc->entropyWorkspace, zc->bmi2);
-        if (ZSTD_isError(cSize) || cSize == 0) return cSize;
-        /* confirm repcodes and entropy tables */
-        {   ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
-            zc->blockState.prevCBlock = zc->blockState.nextCBlock;
-            zc->blockState.nextCBlock = tmp;
-        }
-        return cSize;
+    cSize = ZSTD_compressSequences(&zc->seqStore,
+            &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
+            &zc->appliedParams,
+            dst, dstCapacity,
+            srcSize, zc->entropyWorkspace, zc->bmi2);
+
+out:
+    if (!ZSTD_isError(cSize) && cSize != 0) {
+        /* confirm repcodes and entropy tables when emitting a compressed block */
+        ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
+        zc->blockState.prevCBlock = zc->blockState.nextCBlock;
+        zc->blockState.nextCBlock = tmp;
     }
+    /* We check that dictionaries have offset codes available for the first
+     * block. After the first block, the offcode table might not have large
+     * enough codes to represent the offsets in the data.
+     */
+    if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+        zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
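+    /* (Demoting to FSE_repeat_check means the offset table may only be reused
+     * after the next block verifies it can represent that block's offset codes.) */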
+
+    return cSize;
 }
 
 
@@ -2005,13 +2553,13 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx,
             ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30);
             ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30);
             ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
-
             ZSTD_reduceIndex(cctx, correction);
             if (ms->nextToUpdate < correction) ms->nextToUpdate = 0;
             else ms->nextToUpdate -= correction;
             ms->loadedDictEnd = 0;
+            ms->dictMatchState = NULL;
         }
-        ZSTD_window_enforceMaxDist(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd);
+        ZSTD_window_enforceMaxDist(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState);
         if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit;
 
         {   size_t cSize = ZSTD_compressBlock_internal(cctx,
@@ -2020,11 +2568,8 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx,
             if (ZSTD_isError(cSize)) return cSize;
 
             if (cSize == 0) {  /* block is not compressible */
-                U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(blockSize << 3);
-                if (blockSize + ZSTD_blockHeaderSize > dstCapacity) return ERROR(dstSize_tooSmall);
-                MEM_writeLE32(op, cBlockHeader24);   /* 4th byte will be overwritten */
-                memcpy(op + ZSTD_blockHeaderSize, ip, blockSize);
-                cSize = ZSTD_blockHeaderSize + blockSize;
+                cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
+                if (ZSTD_isError(cSize)) return cSize;
             } else {
                 U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
                 MEM_writeLE24(op, cBlockHeader24);
@@ -2060,6 +2605,7 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity,
     BYTE  const frameHeaderDecriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6) );
     size_t pos=0;
 
+    assert(!(params.fParams.contentSizeFlag && pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN));
     if (dstCapacity < ZSTD_frameHeaderSize_max) return ERROR(dstSize_tooSmall);
     DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u",
                 !params.fParams.noDictIDFlag, dictID,  dictIDSizeCode);
@@ -2122,7 +2668,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
                         const void* src, size_t srcSize,
                                U32 frame, U32 lastFrameChunk)
 {
-    ZSTD_matchState_t* ms = &cctx->blockState.matchState;
+    ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
     size_t fhSize = 0;
 
     DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u",
@@ -2143,8 +2689,25 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
     if (!ZSTD_window_update(&ms->window, src, srcSize)) {
         ms->nextToUpdate = ms->window.dictLimit;
     }
-    if (cctx->appliedParams.ldmParams.enableLdm)
+    if (cctx->appliedParams.ldmParams.enableLdm) {
         ZSTD_window_update(&cctx->ldmState.window, src, srcSize);
+    }
+
+    if (!frame) {
+        /* overflow check and correction for block mode */
+        if (ZSTD_window_needOverflowCorrection(ms->window, (const char*)src + srcSize)) {
+            U32 const cycleLog = ZSTD_cycleLog(cctx->appliedParams.cParams.chainLog, cctx->appliedParams.cParams.strategy);
+            U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, 1 << cctx->appliedParams.cParams.windowLog, src);
+            ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30);
+            ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30);
+            ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+            ZSTD_reduceIndex(cctx, correction);
+            if (ms->nextToUpdate < correction) ms->nextToUpdate = 0;
+            else ms->nextToUpdate -= correction;
+            ms->loadedDictEnd = 0;
+            ms->dictMatchState = NULL;
+        }
+    }
 
     DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (U32)cctx->blockSize);
     {   size_t const cSize = frame ?
@@ -2153,7 +2716,9 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
         if (ZSTD_isError(cSize)) return cSize;
         cctx->consumedSrcSize += srcSize;
         cctx->producedCSize += (cSize + fhSize);
-        if (cctx->appliedParams.fParams.contentSizeFlag) {  /* control src size */
+        assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0));
+        if (cctx->pledgedSrcSizePlusOne != 0) {  /* control src size */
+            ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1);
             if (cctx->consumedSrcSize+1 > cctx->pledgedSrcSizePlusOne) {
                 DEBUGLOG(4, "error : pledgedSrcSize = %u, while realSrcSize >= %u",
                     (U32)cctx->pledgedSrcSizePlusOne-1, (U32)cctx->consumedSrcSize);
@@ -2184,44 +2749,50 @@ size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const
 {
     size_t const blockSizeMax = ZSTD_getBlockSize(cctx);
     if (srcSize > blockSizeMax) return ERROR(srcSize_wrong);
+
     return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */);
 }
 
 /*! ZSTD_loadDictionaryContent() :
  *  @return : 0, or an error code
  */
-static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const void* src, size_t srcSize)
+static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
+                                         ZSTD_CCtx_params const* params,
+                                         const void* src, size_t srcSize,
+                                         ZSTD_dictTableLoadMethod_e dtlm)
 {
     const BYTE* const ip = (const BYTE*) src;
     const BYTE* const iend = ip + srcSize;
-    ZSTD_compressionParameters const* cParams = &params->cParams;
 
     ZSTD_window_update(&ms->window, src, srcSize);
     ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
 
+    /* Assert that the ms params match the params we're being given */
+    ZSTD_assertEqualCParams(params->cParams, ms->cParams);
+
     if (srcSize <= HASH_READ_SIZE) return 0;
 
     switch(params->cParams.strategy)
     {
     case ZSTD_fast:
-        ZSTD_fillHashTable(ms, cParams, iend);
+        ZSTD_fillHashTable(ms, iend, dtlm);
         break;
     case ZSTD_dfast:
-        ZSTD_fillDoubleHashTable(ms, cParams, iend);
+        ZSTD_fillDoubleHashTable(ms, iend, dtlm);
         break;
 
     case ZSTD_greedy:
     case ZSTD_lazy:
     case ZSTD_lazy2:
         if (srcSize >= HASH_READ_SIZE)
-            ZSTD_insertAndFindFirstIndex(ms, cParams, iend-HASH_READ_SIZE);
+            ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE);
         break;
 
     case ZSTD_btlazy2:   /* we want the dictionary table fully sorted */
     case ZSTD_btopt:
     case ZSTD_btultra:
         if (srcSize >= HASH_READ_SIZE)
-            ZSTD_updateTree(ms, cParams, iend-HASH_READ_SIZE, iend);
+            ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend);
         break;
 
     default:
@@ -2256,7 +2827,12 @@ static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSym
  *  assumptions : magic number supposed already checked
  *                dictSize supposed > 8
  */
-static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const void* dict, size_t dictSize, void* workspace)
+static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
+                                      ZSTD_matchState_t* ms,
+                                      ZSTD_CCtx_params const* params,
+                                      const void* dict, size_t dictSize,
+                                      ZSTD_dictTableLoadMethod_e dtlm,
+                                      void* workspace)
 {
     const BYTE* dictPtr = (const BYTE*)dict;
     const BYTE* const dictEnd = dictPtr + dictSize;
@@ -2265,13 +2841,15 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_matc
     size_t dictID;
 
     ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
     assert(dictSize > 8);
+    assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY);
 
     dictPtr += 4;   /* skip magic number */
     dictID = params->fParams.noDictIDFlag ? 0 :  MEM_readLE32(dictPtr);
     dictPtr += 4;
 
     {   unsigned maxSymbolValue = 255;
-        size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.hufCTable, &maxSymbolValue, dictPtr, dictEnd-dictPtr);
+        size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, dictEnd-dictPtr);
         if (HUF_isError(hufHeaderSize)) return ERROR(dictionary_corrupted);
         if (maxSymbolValue < 255) return ERROR(dictionary_corrupted);
         dictPtr += hufHeaderSize;
@@ -2282,7 +2860,8 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_matc
         if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
         if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
         /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
-        CHECK_E( FSE_buildCTable_wksp(bs->entropy.offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, workspace, HUF_WORKSPACE_SIZE),
+        /* fill all offset symbols to avoid garbage at end of table */
+        CHECK_E( FSE_buildCTable_wksp(bs->entropy.fse.offcodeCTable, offcodeNCount, MaxOff, offcodeLog, workspace, HUF_WORKSPACE_SIZE),
                  dictionary_corrupted);
         dictPtr += offcodeHeaderSize;
     }
@@ -2294,7 +2873,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_matc
         if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
         /* Every match length code must have non-zero probability */
         CHECK_F( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML));
-        CHECK_E( FSE_buildCTable_wksp(bs->entropy.matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, workspace, HUF_WORKSPACE_SIZE),
+        CHECK_E( FSE_buildCTable_wksp(bs->entropy.fse.matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, workspace, HUF_WORKSPACE_SIZE),
                  dictionary_corrupted);
         dictPtr += matchlengthHeaderSize;
     }
@@ -2306,7 +2885,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_matc
         if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
         /* Every literal length code must have non-zero probability */
         CHECK_F( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL));
-        CHECK_E( FSE_buildCTable_wksp(bs->entropy.litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, workspace, HUF_WORKSPACE_SIZE),
+        CHECK_E( FSE_buildCTable_wksp(bs->entropy.fse.litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, workspace, HUF_WORKSPACE_SIZE),
                  dictionary_corrupted);
         dictPtr += litlengthHeaderSize;
     }
@@ -2332,22 +2911,25 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_matc
                 if (bs->rep[u] > dictContentSize) return ERROR(dictionary_corrupted);
         }   }
 
-        bs->entropy.hufCTable_repeatMode = HUF_repeat_valid;
-        bs->entropy.offcode_repeatMode = FSE_repeat_valid;
-        bs->entropy.matchlength_repeatMode = FSE_repeat_valid;
-        bs->entropy.litlength_repeatMode = FSE_repeat_valid;
-        CHECK_F(ZSTD_loadDictionaryContent(ms, params, dictPtr, dictContentSize));
+        bs->entropy.huf.repeatMode = HUF_repeat_valid;
+        bs->entropy.fse.offcode_repeatMode = FSE_repeat_valid;
+        bs->entropy.fse.matchlength_repeatMode = FSE_repeat_valid;
+        bs->entropy.fse.litlength_repeatMode = FSE_repeat_valid;
+        CHECK_F(ZSTD_loadDictionaryContent(ms, params, dictPtr, dictContentSize, dtlm));
         return dictID;
     }
 }
 
 /** ZSTD_compress_insertDictionary() :
 *   @return : dictID, or an error code */
-static size_t ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, ZSTD_matchState_t* ms,
-                                             ZSTD_CCtx_params const* params,
-                                       const void* dict, size_t dictSize,
-                                             ZSTD_dictContentType_e dictContentType,
-                                             void* workspace)
+static size_t
+ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
+                               ZSTD_matchState_t* ms,
+                         const ZSTD_CCtx_params* params,
+                         const void* dict, size_t dictSize,
+                               ZSTD_dictContentType_e dictContentType,
+                               ZSTD_dictTableLoadMethod_e dtlm,
+                               void* workspace)
 {
     DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
     if ((dict==NULL) || (dictSize<=8)) return 0;
@@ -2356,12 +2938,12 @@ static size_t ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, ZS
 
     /* dict restricted modes */
     if (dictContentType == ZSTD_dct_rawContent)
-        return ZSTD_loadDictionaryContent(ms, params, dict, dictSize);
+        return ZSTD_loadDictionaryContent(ms, params, dict, dictSize, dtlm);
 
     if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
         if (dictContentType == ZSTD_dct_auto) {
             DEBUGLOG(4, "raw content dictionary detected");
-            return ZSTD_loadDictionaryContent(ms, params, dict, dictSize);
+            return ZSTD_loadDictionaryContent(ms, params, dict, dictSize, dtlm);
         }
         if (dictContentType == ZSTD_dct_fullDict)
             return ERROR(dictionary_wrong);
@@ -2369,17 +2951,18 @@ static size_t ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, ZS
     }
 
     /* dict as full zstd dictionary */
-    return ZSTD_loadZstdDictionary(bs, ms, params, dict, dictSize, workspace);
+    return ZSTD_loadZstdDictionary(bs, ms, params, dict, dictSize, dtlm, workspace);
 }
 
 /*! ZSTD_compressBegin_internal() :
  * @return : 0, or an error code */
-size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
-                             const void* dict, size_t dictSize,
-                             ZSTD_dictContentType_e dictContentType,
-                             const ZSTD_CDict* cdict,
-                             ZSTD_CCtx_params params, U64 pledgedSrcSize,
-                             ZSTD_buffered_policy_e zbuff)
+static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
+                                    const void* dict, size_t dictSize,
+                                    ZSTD_dictContentType_e dictContentType,
+                                    ZSTD_dictTableLoadMethod_e dtlm,
+                                    const ZSTD_CDict* cdict,
+                                    ZSTD_CCtx_params params, U64 pledgedSrcSize,
+                                    ZSTD_buffered_policy_e zbuff)
 {
     DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params.cParams.windowLog);
     /* params are supposed to be fully validated at this point */
@@ -2387,9 +2970,7 @@ size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
     assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
 
     if (cdict && cdict->dictContentSize>0) {
-        cctx->requestedParams = params;
-        return ZSTD_resetCCtx_usingCDict(cctx, cdict, params.cParams.windowLog,
-                                         params.fParams, pledgedSrcSize, zbuff);
+        return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff);
     }
 
     CHECK_F( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
@@ -2397,7 +2978,7 @@ size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
     {
         size_t const dictID = ZSTD_compress_insertDictionary(
                 cctx->blockState.prevCBlock, &cctx->blockState.matchState,
-                &params, dict, dictSize, dictContentType, cctx->entropyWorkspace);
+                &params, dict, dictSize, dictContentType, dtlm, cctx->entropyWorkspace);
         if (ZSTD_isError(dictID)) return dictID;
         assert(dictID <= (size_t)(U32)-1);
         cctx->dictID = (U32)dictID;
@@ -2408,6 +2989,7 @@ size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
 size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
                                     const void* dict, size_t dictSize,
                                     ZSTD_dictContentType_e dictContentType,
+                                    ZSTD_dictTableLoadMethod_e dtlm,
                                     const ZSTD_CDict* cdict,
                                     ZSTD_CCtx_params params,
                                     unsigned long long pledgedSrcSize)
@@ -2416,7 +2998,7 @@ size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
     /* compression parameters verification and optimization */
     CHECK_F( ZSTD_checkCParams(params.cParams) );
     return ZSTD_compressBegin_internal(cctx,
-                                       dict, dictSize, dictContentType,
+                                       dict, dictSize, dictContentType, dtlm,
                                        cdict,
                                        params, pledgedSrcSize,
                                        ZSTDb_not_buffered);
@@ -2431,7 +3013,7 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
     ZSTD_CCtx_params const cctxParams =
             ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params);
     return ZSTD_compressBegin_advanced_internal(cctx,
-                                            dict, dictSize, ZSTD_dct_auto,
+                                            dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast,
                                             NULL /*cdict*/,
                                             cctxParams, pledgedSrcSize);
 }
@@ -2442,7 +3024,7 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di
     ZSTD_CCtx_params const cctxParams =
             ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params);
     DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (U32)dictSize);
-    return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, NULL,
+    return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
                                        cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered);
 }
 
@@ -2505,7 +3087,9 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
     if (ZSTD_isError(cSize)) return cSize;
     endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize);
     if (ZSTD_isError(endResult)) return endResult;
-    if (cctx->appliedParams.fParams.contentSizeFlag) {  /* control src size */
+    assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0));
+    if (cctx->pledgedSrcSizePlusOne != 0) {  /* control src size */
+        ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1);
         DEBUGLOG(4, "end of frame : controlling src size");
         if (cctx->pledgedSrcSizePlusOne != cctx->consumedSrcSize+1) {
             DEBUGLOG(4, "error : pledgedSrcSize = %u, while realSrcSize = %u",
@@ -2517,22 +3101,22 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
 
 
 static size_t ZSTD_compress_internal (ZSTD_CCtx* cctx,
-                               void* dst, size_t dstCapacity,
-                         const void* src, size_t srcSize,
-                         const void* dict,size_t dictSize,
-                               ZSTD_parameters params)
+                                      void* dst, size_t dstCapacity,
+                                const void* src, size_t srcSize,
+                                const void* dict,size_t dictSize,
+                                      ZSTD_parameters params)
 {
     ZSTD_CCtx_params const cctxParams =
             ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params);
     DEBUGLOG(4, "ZSTD_compress_internal");
     return ZSTD_compress_advanced_internal(cctx,
-                                          dst, dstCapacity,
-                                          src, srcSize,
-                                          dict, dictSize,
-                                          cctxParams);
+                                           dst, dstCapacity,
+                                           src, srcSize,
+                                           dict, dictSize,
+                                           cctxParams);
 }
 
-size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx,
+size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
                                void* dst, size_t dstCapacity,
                          const void* src, size_t srcSize,
                          const void* dict,size_t dictSize,
@@ -2540,7 +3124,11 @@ size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx,
 {
     DEBUGLOG(4, "ZSTD_compress_advanced");
     CHECK_F(ZSTD_checkCParams(params.cParams));
-    return ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, dict, dictSize, params);
+    return ZSTD_compress_internal(cctx,
+                                  dst, dstCapacity,
+                                  src, srcSize,
+                                  dict, dictSize,
+                                  params);
 }
 
 /* Internal */
@@ -2551,37 +3139,44 @@ size_t ZSTD_compress_advanced_internal(
         const void* dict,size_t dictSize,
         ZSTD_CCtx_params params)
 {
-    DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)",
-                (U32)srcSize);
-    CHECK_F( ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, NULL,
-                                         params, srcSize, ZSTDb_not_buffered) );
+    DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", (U32)srcSize);
+    CHECK_F( ZSTD_compressBegin_internal(cctx,
+                         dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
+                         params, srcSize, ZSTDb_not_buffered) );
     return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
 }
 
-size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize,
-                               const void* dict, size_t dictSize, int compressionLevel)
+size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize,
+                         const void* dict, size_t dictSize,
+                               int compressionLevel)
 {
-    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, srcSize ? srcSize : 1, dict ? dictSize : 0);
+    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, srcSize + (!srcSize), dict ? dictSize : 0);
     ZSTD_CCtx_params cctxParams = ZSTD_assignParamsToCCtxParams(cctx->requestedParams, params);
     assert(params.fParams.contentSizeFlag == 1);
-    ZSTD_CCtxParam_setParameter(&cctxParams, ZSTD_p_compressLiterals, compressionLevel>=0);
     return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, cctxParams);
 }
 
-size_t ZSTD_compressCCtx (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel)
+size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
+                         void* dst, size_t dstCapacity,
+                   const void* src, size_t srcSize,
+                         int compressionLevel)
 {
     DEBUGLOG(4, "ZSTD_compressCCtx (srcSize=%u)", (U32)srcSize);
+    assert(cctx != NULL);
     return ZSTD_compress_usingDict(cctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel);
 }
 
-size_t ZSTD_compress(void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel)
+size_t ZSTD_compress(void* dst, size_t dstCapacity,
+               const void* src, size_t srcSize,
+                     int compressionLevel)
 {
     size_t result;
     ZSTD_CCtx ctxBody;
-    memset(&ctxBody, 0, sizeof(ctxBody));
-    ctxBody.customMem = ZSTD_defaultCMem;
+    ZSTD_initCCtx(&ctxBody, ZSTD_defaultCMem);
     result = ZSTD_compressCCtx(&ctxBody, dst, dstCapacity, src, srcSize, compressionLevel);
-    ZSTD_free(ctxBody.workSpace, ZSTD_defaultCMem);  /* can't free ctxBody itself, as it's on stack; free only heap content */
+    ZSTD_freeCCtxContent(&ctxBody);   /* can't free ctxBody itself, as it's on stack; free only heap content */
     return result;
 }
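ZSTD_compress() above now builds and tears down its stack-allocated context through ZSTD_initCCtx() / ZSTD_freeCCtxContent(); the public one-shot API is unchanged. A minimal caller-side sketch (buffer and payload names are illustrative, not part of this patch):

#include <stdio.h>
#include <stdlib.h>
#include <zstd.h>

int main(void)
{
    const char src[] = "an example payload, repeated repeated repeated";
    size_t const dstCapacity = ZSTD_compressBound(sizeof(src));
    void* const dst = malloc(dstCapacity);
    if (dst == NULL) return 1;
    {   size_t const cSize = ZSTD_compress(dst, dstCapacity, src, sizeof(src), 3 /* level */);
        if (ZSTD_isError(cSize)) {
            printf("error: %s\n", ZSTD_getErrorName(cSize));
            free(dst);
            return 1;
        }
        printf("compressed %u -> %u bytes\n", (unsigned)sizeof(src), (unsigned)cSize);
    }
    free(dst);
    return 0;
}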
 
@@ -2619,9 +3214,9 @@ static size_t ZSTD_initCDict_internal(
                     ZSTD_dictContentType_e dictContentType,
                     ZSTD_compressionParameters cParams)
 {
-    DEBUGLOG(3, "ZSTD_initCDict_internal, dictContentType %u", (U32)dictContentType);
+    DEBUGLOG(3, "ZSTD_initCDict_internal (dictContentType:%u)", (U32)dictContentType);
     assert(!ZSTD_checkCParams(cParams));
-    cdict->cParams = cParams;
+    cdict->matchState.cParams = cParams;
     if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) {
         cdict->dictBuffer = NULL;
         cdict->dictContent = dictBuffer;
@@ -2654,7 +3249,7 @@ static size_t ZSTD_initCDict_internal(
         {   size_t const dictID = ZSTD_compress_insertDictionary(
                     &cdict->cBlockState, &cdict->matchState, &params,
                     cdict->dictContent, cdict->dictContentSize,
-                    dictContentType, cdict->workspace);
+                    dictContentType, ZSTD_dtlm_full, cdict->workspace);
             if (ZSTD_isError(dictID)) return dictID;
             assert(dictID <= (size_t)(U32)-1);
             cdict->dictID = (U32)dictID;
@@ -2775,7 +3370,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict(
 ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict)
 {
     assert(cdict != NULL);
-    return cdict->cParams;
+    return cdict->matchState.cParams;
 }
 
 /* ZSTD_compressBegin_usingCDict_advanced() :
@@ -2799,7 +3394,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(
         }
         params.fParams = fParams;
         return ZSTD_compressBegin_internal(cctx,
-                                           NULL, 0, ZSTD_dct_auto,
+                                           NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast,
                                            cdict,
                                            params, pledgedSrcSize,
                                            ZSTDb_not_buffered);
@@ -2813,7 +3408,7 @@ size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
 {
     ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
     DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u", !fParams.noDictIDFlag);
-    return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, 0);
+    return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN);
 }
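The hunk above makes ZSTD_compressBegin_usingCDict() pledge ZSTD_CONTENTSIZE_UNKNOWN rather than 0. From an application's point of view, digested-dictionary compression is still driven through the public CDict API; a minimal sketch, assuming dictBuf/dictSize hold a previously loaded dictionary:

#include <zstd.h>

/* Minimal sketch: compress `src` with a pre-digested dictionary.
 * Returns the compressed size, 0 on allocation failure, or an error
 * code checkable with ZSTD_isError(). */
size_t compressWithCDict(void* dst, size_t dstCapacity,
                   const void* src, size_t srcSize,
                   const void* dictBuf, size_t dictSize, int level)
{
    size_t cSize = 0;
    ZSTD_CCtx*  const cctx  = ZSTD_createCCtx();
    ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictSize, level);
    if (cctx && cdict)
        cSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, src, srcSize, cdict);
    ZSTD_freeCDict(cdict);   /* free functions accept NULL */
    ZSTD_freeCCtx(cctx);
    return cSize;
}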
 
 size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
@@ -2880,16 +3475,17 @@ size_t ZSTD_CStreamOutSize(void)
 static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx,
                     const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType,
                     const ZSTD_CDict* const cdict,
-                    ZSTD_CCtx_params const params, unsigned long long const pledgedSrcSize)
+                    ZSTD_CCtx_params params, unsigned long long const pledgedSrcSize)
 {
-    DEBUGLOG(4, "ZSTD_resetCStream_internal (disableLiteralCompression=%i)",
-                params.disableLiteralCompression);
+    DEBUGLOG(4, "ZSTD_resetCStream_internal");
+    /* Finalize the compression parameters */
+    params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, dictSize);
     /* params are supposed to be fully validated at this point */
     assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
     assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
 
     CHECK_F( ZSTD_compressBegin_internal(cctx,
-                                         dict, dictSize, dictContentType,
+                                         dict, dictSize, dictContentType, ZSTD_dtlm_fast,
                                          cdict,
                                          params, pledgedSrcSize,
                                          ZSTDb_buffered) );
@@ -2912,7 +3508,6 @@ size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize)
     DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (U32)pledgedSrcSize);
     if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;
     params.fParams.contentSizeFlag = 1;
-    params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, 0);
     return ZSTD_resetCStream_internal(zcs, NULL, 0, ZSTD_dct_auto, zcs->cdict, params, pledgedSrcSize);
 }
 
@@ -2925,6 +3520,7 @@ size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
                     ZSTD_CCtx_params params, unsigned long long pledgedSrcSize)
 {
     DEBUGLOG(4, "ZSTD_initCStream_internal");
+    params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, dictSize);
     assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
     assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
 
@@ -2991,25 +3587,21 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
                 (U32)pledgedSrcSize, params.fParams.contentSizeFlag);
     CHECK_F( ZSTD_checkCParams(params.cParams) );
     if ((pledgedSrcSize==0) && (params.fParams.contentSizeFlag==0)) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;  /* for compatibility with older programs relying on this behavior. Users should now specify ZSTD_CONTENTSIZE_UNKNOWN. This line will be removed in the future. */
-    {   ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
-        return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, cctxParams, pledgedSrcSize);
-    }
+    zcs->requestedParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
+    return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, zcs->requestedParams, pledgedSrcSize);
 }
 
 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel)
 {
-    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize);
-    ZSTD_CCtx_params const cctxParams =
-            ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
-    return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, cctxParams, ZSTD_CONTENTSIZE_UNKNOWN);
+    ZSTD_CCtxParams_init(&zcs->requestedParams, compressionLevel);
+    return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, zcs->requestedParams, ZSTD_CONTENTSIZE_UNKNOWN);
 }
 
 size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss)
 {
     U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;  /* temporary : 0 interpreted as "unknown" during transition period. Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. `0` will be interpreted as "empty" in the future */
-    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, pledgedSrcSize, 0);
-    ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params);
-    return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, cctxParams, pledgedSrcSize);
+    ZSTD_CCtxParams_init(&zcs->requestedParams, compressionLevel);
+    return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, zcs->requestedParams, pledgedSrcSize);
 }
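ZSTD_initCStream_usingDict() and ZSTD_initCStream_srcSize() above now seed zcs->requestedParams via ZSTD_CCtxParams_init() and let ZSTD_initCStream_internal() resolve the final cParams. The public streaming flow is unchanged; a minimal sketch (assumes the destination can hold ZSTD_compressBound(srcSize) bytes, so a single final flush suffices):

#include <zstd.h>

/* One-shot compression through the streaming API.
 * Returns the compressed size, 0 on allocation failure,
 * or an error code checkable with ZSTD_isError(). */
size_t streamCompressOnce(void* dst, size_t dstCapacity,
                    const void* src, size_t srcSize, int level)
{
    ZSTD_CStream* const zcs = ZSTD_createCStream();
    ZSTD_inBuffer  input  = { src, srcSize, 0 };
    ZSTD_outBuffer output = { dst, dstCapacity, 0 };
    size_t ret;
    if (zcs == NULL) return 0;
    ret = ZSTD_initCStream(zcs, level);
    while (!ZSTD_isError(ret) && input.pos < input.size)
        ret = ZSTD_compressStream(zcs, &output, &input);   /* consume all input */
    if (!ZSTD_isError(ret))
        ret = ZSTD_endStream(zcs, &output);   /* 0 means the frame is fully flushed */
    ZSTD_freeCStream(zcs);
    return ZSTD_isError(ret) ? ret : output.pos;
}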
 
 size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
@@ -3073,7 +3665,7 @@ size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                 ip = iend;
                 op += cSize;
                 zcs->frameEnded = 1;
-                ZSTD_startNewCompression(zcs);
+                ZSTD_CCtx_reset(zcs);
                 someMoreWork = 0; break;
             }
             /* complete loading into inBuffer */
@@ -3126,7 +3718,7 @@ size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                     if (zcs->frameEnded) {
                         DEBUGLOG(5, "Frame completed directly in outBuffer");
                         someMoreWork = 0;
-                        ZSTD_startNewCompression(zcs);
+                        ZSTD_CCtx_reset(zcs);
                     }
                     break;
                 }
@@ -3154,7 +3746,7 @@ size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                 if (zcs->frameEnded) {
                     DEBUGLOG(5, "Frame completed on flush");
                     someMoreWork = 0;
-                    ZSTD_startNewCompression(zcs);
+                    ZSTD_CCtx_reset(zcs);
                     break;
                 }
                 zcs->streamStage = zcss_load;
@@ -3207,19 +3799,16 @@ size_t ZSTD_compress_generic (ZSTD_CCtx* cctx,
         params.cParams = ZSTD_getCParamsFromCCtxParams(
                 &cctx->requestedParams, cctx->pledgedSrcSizePlusOne-1, 0 /*dictSize*/);
 
+
 #ifdef ZSTD_MULTITHREAD
         if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) {
             params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */
         }
         if (params.nbWorkers > 0) {
             /* mt context creation */
-            if (cctx->mtctx == NULL || (params.nbWorkers != ZSTDMT_getNbWorkers(cctx->mtctx))) {
+            if (cctx->mtctx == NULL) {
                 DEBUGLOG(4, "ZSTD_compress_generic: creating new mtctx for nbWorkers=%u",
                             params.nbWorkers);
-                if (cctx->mtctx != NULL)
-                    DEBUGLOG(4, "ZSTD_compress_generic: previous nbWorkers was %u",
-                                ZSTDMT_getNbWorkers(cctx->mtctx));
-                ZSTDMT_freeCCtx(cctx->mtctx);
                 cctx->mtctx = ZSTDMT_createCCtx_advanced(params.nbWorkers, cctx->customMem);
                 if (cctx->mtctx == NULL) return ERROR(memory_allocation);
             }
@@ -3251,8 +3840,9 @@ size_t ZSTD_compress_generic (ZSTD_CCtx* cctx,
         {   size_t const flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp);
             if ( ZSTD_isError(flushMin)
               || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */
-                ZSTD_startNewCompression(cctx);
+                ZSTD_CCtx_reset(cctx);
             }
+            DEBUGLOG(5, "completed ZSTD_compress_generic delegating to ZSTDMT_compressStream_generic");
             return flushMin;
     }   }
 #endif
@@ -3308,82 +3898,83 @@ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
 
 #define ZSTD_MAX_CLEVEL     22
 int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }
+int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; }
 
 static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = {
 {   /* "default" - guarantees a monotonically increasing memory budget */
     /* W,  C,  H,  S,  L, TL, strat */
     { 19, 12, 13,  1,  6,  1, ZSTD_fast    },  /* base for negative levels */
-    { 19, 13, 14,  1,  7,  1, ZSTD_fast    },  /* level  1 */
-    { 19, 15, 16,  1,  6,  1, ZSTD_fast    },  /* level  2 */
-    { 20, 16, 17,  1,  5,  8, ZSTD_dfast   },  /* level  3 */
-    { 20, 17, 18,  1,  5,  8, ZSTD_dfast   },  /* level  4 */
-    { 20, 17, 18,  2,  5, 16, ZSTD_greedy  },  /* level  5 */
-    { 21, 17, 19,  2,  5, 16, ZSTD_lazy    },  /* level  6 */
-    { 21, 18, 19,  3,  5, 16, ZSTD_lazy    },  /* level  7 */
-    { 21, 18, 20,  3,  5, 16, ZSTD_lazy2   },  /* level  8 */
-    { 21, 19, 20,  3,  5, 16, ZSTD_lazy2   },  /* level  9 */
-    { 21, 19, 21,  4,  5, 16, ZSTD_lazy2   },  /* level 10 */
-    { 22, 20, 22,  4,  5, 16, ZSTD_lazy2   },  /* level 11 */
+    { 19, 13, 14,  1,  7,  0, ZSTD_fast    },  /* level  1 */
+    { 19, 15, 16,  1,  6,  0, ZSTD_fast    },  /* level  2 */
+    { 20, 16, 17,  1,  5,  1, ZSTD_dfast   },  /* level  3 */
+    { 20, 18, 18,  1,  5,  1, ZSTD_dfast   },  /* level  4 */
+    { 20, 18, 18,  2,  5,  2, ZSTD_greedy  },  /* level  5 */
+    { 21, 18, 19,  2,  5,  4, ZSTD_lazy    },  /* level  6 */
+    { 21, 18, 19,  3,  5,  8, ZSTD_lazy2   },  /* level  7 */
+    { 21, 19, 19,  3,  5, 16, ZSTD_lazy2   },  /* level  8 */
+    { 21, 19, 20,  4,  5, 16, ZSTD_lazy2   },  /* level  9 */
+    { 21, 20, 21,  4,  5, 16, ZSTD_lazy2   },  /* level 10 */
+    { 21, 21, 22,  4,  5, 16, ZSTD_lazy2   },  /* level 11 */
     { 22, 20, 22,  5,  5, 16, ZSTD_lazy2   },  /* level 12 */
     { 22, 21, 22,  4,  5, 32, ZSTD_btlazy2 },  /* level 13 */
     { 22, 21, 22,  5,  5, 32, ZSTD_btlazy2 },  /* level 14 */
     { 22, 22, 22,  6,  5, 32, ZSTD_btlazy2 },  /* level 15 */
     { 22, 21, 22,  4,  5, 48, ZSTD_btopt   },  /* level 16 */
-    { 23, 22, 22,  4,  4, 48, ZSTD_btopt   },  /* level 17 */
-    { 23, 22, 22,  5,  3, 64, ZSTD_btopt   },  /* level 18 */
-    { 23, 23, 22,  7,  3,128, ZSTD_btopt   },  /* level 19 */
-    { 25, 25, 23,  7,  3,128, ZSTD_btultra },  /* level 20 */
-    { 26, 26, 24,  7,  3,256, ZSTD_btultra },  /* level 21 */
-    { 27, 27, 25,  9,  3,512, ZSTD_btultra },  /* level 22 */
+    { 23, 22, 22,  4,  4, 64, ZSTD_btopt   },  /* level 17 */
+    { 23, 23, 22,  6,  3,256, ZSTD_btopt   },  /* level 18 */
+    { 23, 24, 22,  7,  3,256, ZSTD_btultra },  /* level 19 */
+    { 25, 25, 23,  7,  3,256, ZSTD_btultra },  /* level 20 */
+    { 26, 26, 24,  7,  3,512, ZSTD_btultra },  /* level 21 */
+    { 27, 27, 25,  9,  3,999, ZSTD_btultra },  /* level 22 */
 },
 {   /* for srcSize <= 256 KB */
     /* W,  C,  H,  S,  L,  T, strat */
     { 18, 12, 13,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
-    { 18, 13, 14,  1,  6,  1, ZSTD_fast    },  /* level  1 */
-    { 18, 14, 13,  1,  5,  8, ZSTD_dfast   },  /* level  2 */
-    { 18, 16, 15,  1,  5,  8, ZSTD_dfast   },  /* level  3 */
-    { 18, 15, 17,  1,  5,  8, ZSTD_greedy  },  /* level  4.*/
-    { 18, 16, 17,  4,  5,  8, ZSTD_greedy  },  /* level  5.*/
-    { 18, 16, 17,  3,  5,  8, ZSTD_lazy    },  /* level  6.*/
-    { 18, 17, 17,  4,  4,  8, ZSTD_lazy    },  /* level  7 */
-    { 18, 17, 17,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
-    { 18, 17, 17,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
-    { 18, 17, 17,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
-    { 18, 18, 17,  6,  4,  8, ZSTD_lazy2   },  /* level 11.*/
-    { 18, 18, 17,  5,  4,  8, ZSTD_btlazy2 },  /* level 12.*/
-    { 18, 19, 17,  7,  4,  8, ZSTD_btlazy2 },  /* level 13 */
-    { 18, 18, 18,  4,  4, 16, ZSTD_btopt   },  /* level 14.*/
-    { 18, 18, 18,  4,  3, 16, ZSTD_btopt   },  /* level 15.*/
-    { 18, 19, 18,  6,  3, 32, ZSTD_btopt   },  /* level 16.*/
-    { 18, 19, 18,  8,  3, 64, ZSTD_btopt   },  /* level 17.*/
-    { 18, 19, 18,  9,  3,128, ZSTD_btopt   },  /* level 18.*/
-    { 18, 19, 18, 10,  3,256, ZSTD_btopt   },  /* level 19.*/
-    { 18, 19, 18, 11,  3,512, ZSTD_btultra },  /* level 20.*/
-    { 18, 19, 18, 12,  3,512, ZSTD_btultra },  /* level 21.*/
-    { 18, 19, 18, 13,  3,512, ZSTD_btultra },  /* level 22.*/
+    { 18, 13, 14,  1,  6,  0, ZSTD_fast    },  /* level  1 */
+    { 18, 14, 14,  1,  5,  1, ZSTD_dfast   },  /* level  2 */
+    { 18, 16, 16,  1,  4,  1, ZSTD_dfast   },  /* level  3 */
+    { 18, 16, 17,  2,  5,  2, ZSTD_greedy  },  /* level  4.*/
+    { 18, 18, 18,  3,  5,  2, ZSTD_greedy  },  /* level  5.*/
+    { 18, 18, 19,  3,  5,  4, ZSTD_lazy    },  /* level  6.*/
+    { 18, 18, 19,  4,  4,  4, ZSTD_lazy    },  /* level  7 */
+    { 18, 18, 19,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
+    { 18, 18, 19,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
+    { 18, 18, 19,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
+    { 18, 18, 19,  5,  4, 16, ZSTD_btlazy2 },  /* level 11.*/
+    { 18, 19, 19,  6,  4, 16, ZSTD_btlazy2 },  /* level 12.*/
+    { 18, 19, 19,  8,  4, 16, ZSTD_btlazy2 },  /* level 13 */
+    { 18, 18, 19,  4,  4, 24, ZSTD_btopt   },  /* level 14.*/
+    { 18, 18, 19,  4,  3, 24, ZSTD_btopt   },  /* level 15.*/
+    { 18, 19, 19,  6,  3, 64, ZSTD_btopt   },  /* level 16.*/
+    { 18, 19, 19,  8,  3,128, ZSTD_btopt   },  /* level 17.*/
+    { 18, 19, 19, 10,  3,256, ZSTD_btopt   },  /* level 18.*/
+    { 18, 19, 19, 10,  3,256, ZSTD_btultra },  /* level 19.*/
+    { 18, 19, 19, 11,  3,512, ZSTD_btultra },  /* level 20.*/
+    { 18, 19, 19, 12,  3,512, ZSTD_btultra },  /* level 21.*/
+    { 18, 19, 19, 13,  3,999, ZSTD_btultra },  /* level 22.*/
 },
 {   /* for srcSize <= 128 KB */
     /* W,  C,  H,  S,  L,  T, strat */
-    { 17, 12, 12,  1,  5,  1, ZSTD_fast    },  /* level  0 - not used */
-    { 17, 12, 13,  1,  6,  1, ZSTD_fast    },  /* level  1 */
-    { 17, 13, 16,  1,  5,  1, ZSTD_fast    },  /* level  2 */
-    { 17, 16, 16,  2,  5,  8, ZSTD_dfast   },  /* level  3 */
-    { 17, 13, 15,  3,  4,  8, ZSTD_greedy  },  /* level  4 */
-    { 17, 15, 17,  4,  4,  8, ZSTD_greedy  },  /* level  5 */
-    { 17, 16, 17,  3,  4,  8, ZSTD_lazy    },  /* level  6 */
-    { 17, 15, 17,  4,  4,  8, ZSTD_lazy2   },  /* level  7 */
+    { 17, 12, 12,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 17, 12, 13,  1,  6,  0, ZSTD_fast    },  /* level  1 */
+    { 17, 13, 15,  1,  5,  0, ZSTD_fast    },  /* level  2 */
+    { 17, 15, 16,  2,  5,  1, ZSTD_dfast   },  /* level  3 */
+    { 17, 17, 17,  2,  4,  1, ZSTD_dfast   },  /* level  4 */
+    { 17, 16, 17,  3,  4,  2, ZSTD_greedy  },  /* level  5 */
+    { 17, 17, 17,  3,  4,  4, ZSTD_lazy    },  /* level  6 */
+    { 17, 17, 17,  3,  4,  8, ZSTD_lazy2   },  /* level  7 */
     { 17, 17, 17,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
     { 17, 17, 17,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
     { 17, 17, 17,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
     { 17, 17, 17,  7,  4,  8, ZSTD_lazy2   },  /* level 11 */
-    { 17, 17, 17,  8,  4,  8, ZSTD_lazy2   },  /* level 12 */
-    { 17, 18, 17,  6,  4,  8, ZSTD_btlazy2 },  /* level 13.*/
-    { 17, 17, 17,  7,  3,  8, ZSTD_btopt   },  /* level 14.*/
-    { 17, 17, 17,  7,  3, 16, ZSTD_btopt   },  /* level 15.*/
-    { 17, 18, 17,  7,  3, 32, ZSTD_btopt   },  /* level 16.*/
-    { 17, 18, 17,  7,  3, 64, ZSTD_btopt   },  /* level 17.*/
-    { 17, 18, 17,  7,  3,256, ZSTD_btopt   },  /* level 18.*/
-    { 17, 18, 17,  8,  3,256, ZSTD_btopt   },  /* level 19.*/
+    { 17, 18, 17,  6,  4, 16, ZSTD_btlazy2 },  /* level 12 */
+    { 17, 18, 17,  8,  4, 16, ZSTD_btlazy2 },  /* level 13.*/
+    { 17, 18, 17,  4,  4, 32, ZSTD_btopt   },  /* level 14.*/
+    { 17, 18, 17,  6,  3, 64, ZSTD_btopt   },  /* level 15.*/
+    { 17, 18, 17,  7,  3,128, ZSTD_btopt   },  /* level 16.*/
+    { 17, 18, 17,  7,  3,256, ZSTD_btopt   },  /* level 17.*/
+    { 17, 18, 17,  8,  3,256, ZSTD_btopt   },  /* level 18.*/
+    { 17, 18, 17,  8,  3,256, ZSTD_btultra },  /* level 19.*/
     { 17, 18, 17,  9,  3,256, ZSTD_btultra },  /* level 20.*/
     { 17, 18, 17, 10,  3,256, ZSTD_btultra },  /* level 21.*/
     { 17, 18, 17, 11,  3,512, ZSTD_btultra },  /* level 22.*/
@@ -3391,28 +3982,28 @@ static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEV
 {   /* for srcSize <= 16 KB */
     /* W,  C,  H,  S,  L,  T, strat */
     { 14, 12, 13,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
-    { 14, 14, 14,  1,  6,  1, ZSTD_fast    },  /* level  1 */
-    { 14, 14, 14,  1,  4,  1, ZSTD_fast    },  /* level  2 */
-    { 14, 14, 14,  1,  4,  6, ZSTD_dfast   },  /* level  3.*/
-    { 14, 14, 14,  4,  4,  6, ZSTD_greedy  },  /* level  4.*/
-    { 14, 14, 14,  3,  4,  6, ZSTD_lazy    },  /* level  5.*/
-    { 14, 14, 14,  4,  4,  6, ZSTD_lazy2   },  /* level  6 */
-    { 14, 14, 14,  5,  4,  6, ZSTD_lazy2   },  /* level  7 */
-    { 14, 14, 14,  6,  4,  6, ZSTD_lazy2   },  /* level  8.*/
-    { 14, 15, 14,  6,  4,  6, ZSTD_btlazy2 },  /* level  9.*/
-    { 14, 15, 14,  3,  3,  6, ZSTD_btopt   },  /* level 10.*/
-    { 14, 15, 14,  6,  3,  8, ZSTD_btopt   },  /* level 11.*/
+    { 14, 14, 15,  1,  5,  0, ZSTD_fast    },  /* level  1 */
+    { 14, 14, 15,  1,  4,  0, ZSTD_fast    },  /* level  2 */
+    { 14, 14, 14,  2,  4,  1, ZSTD_dfast   },  /* level  3.*/
+    { 14, 14, 14,  4,  4,  2, ZSTD_greedy  },  /* level  4.*/
+    { 14, 14, 14,  3,  4,  4, ZSTD_lazy    },  /* level  5.*/
+    { 14, 14, 14,  4,  4,  8, ZSTD_lazy2   },  /* level  6 */
+    { 14, 14, 14,  6,  4,  8, ZSTD_lazy2   },  /* level  7 */
+    { 14, 14, 14,  8,  4,  8, ZSTD_lazy2   },  /* level  8.*/
+    { 14, 15, 14,  5,  4,  8, ZSTD_btlazy2 },  /* level  9.*/
+    { 14, 15, 14,  9,  4,  8, ZSTD_btlazy2 },  /* level 10.*/
+    { 14, 15, 14,  3,  4, 12, ZSTD_btopt   },  /* level 11.*/
     { 14, 15, 14,  6,  3, 16, ZSTD_btopt   },  /* level 12.*/
     { 14, 15, 14,  6,  3, 24, ZSTD_btopt   },  /* level 13.*/
     { 14, 15, 15,  6,  3, 48, ZSTD_btopt   },  /* level 14.*/
     { 14, 15, 15,  6,  3, 64, ZSTD_btopt   },  /* level 15.*/
     { 14, 15, 15,  6,  3, 96, ZSTD_btopt   },  /* level 16.*/
     { 14, 15, 15,  6,  3,128, ZSTD_btopt   },  /* level 17.*/
-    { 14, 15, 15,  6,  3,256, ZSTD_btopt   },  /* level 18.*/
-    { 14, 15, 15,  7,  3,256, ZSTD_btopt   },  /* level 19.*/
+    { 14, 15, 15,  8,  3,256, ZSTD_btopt   },  /* level 18.*/
+    { 14, 15, 15,  6,  3,256, ZSTD_btultra },  /* level 19.*/
     { 14, 15, 15,  8,  3,256, ZSTD_btultra },  /* level 20.*/
     { 14, 15, 15,  9,  3,256, ZSTD_btultra },  /* level 21.*/
-    { 14, 15, 15, 10,  3,256, ZSTD_btultra },  /* level 22.*/
+    { 14, 15, 15, 10,  3,512, ZSTD_btultra },  /* level 22.*/
 },
 };
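These tables feed ZSTD_getCParams(): an entry is selected from the srcSize/dictSize hint (unknown, <=256 KB, <=128 KB, <=16 KB) and the compression level, with levels clamped to the range reported by the new ZSTD_minCLevel() / existing ZSTD_maxCLevel(). A small inspection sketch, assuming ZSTD_minCLevel() is declared elsewhere in this patch and that ZSTD_getCParams() still requires ZSTD_STATIC_LINKING_ONLY:

#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_getCParams, ZSTD_compressionParameters */
#include <stdio.h>
#include <zstd.h>

int main(void)
{
    int level;
    printf("level range : %d .. %d\n", ZSTD_minCLevel(), ZSTD_maxCLevel());
    for (level = 1; level <= ZSTD_maxCLevel(); level++) {
        /* hint of a ~100 KB source, no dictionary : typically the "<= 128 KB" family */
        ZSTD_compressionParameters const cp = ZSTD_getCParams(level, 100 * 1024, 0);
        printf("level %2d : wlog=%u clog=%u hlog=%u slog=%u slen=%u tlen=%u strat=%u\n",
               level, cp.windowLog, cp.chainLog, cp.hashLog,
               cp.searchLog, cp.searchLength, cp.targetLength, (unsigned)cp.strategy);
    }
    return 0;
}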
 
diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h
index 81f12ca6df5e..43f7c1486a9c 100644
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -27,6 +27,7 @@
 extern "C" {
 #endif
 
+
 /*-*************************************
 *  Constants
 ***************************************/
@@ -37,7 +38,8 @@ extern "C" {
                                        It's not a big deal though : candidate will just be sorted again.
                                        Additionnally, candidate position 1 will be lost.
                                        But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
-                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be misdhandled after table re-use with a different strategy */
+                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be misdhandled after table re-use with a different strategy
+                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy.
 
 
 /*-*************************************
@@ -46,6 +48,12 @@ extern "C" {
 typedef enum { ZSTDcs_created=0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e;
 typedef enum { zcss_init=0, zcss_load, zcss_flush } ZSTD_cStreamStage;
 
+typedef enum {
+    ZSTD_dictDefaultAttach = 0,
+    ZSTD_dictForceAttach = 1,
+    ZSTD_dictForceCopy = -1,
+} ZSTD_dictAttachPref_e;
+
 typedef struct ZSTD_prefixDict_s {
     const void* dict;
     size_t dictSize;
@@ -53,14 +61,22 @@ typedef struct ZSTD_prefixDict_s {
 } ZSTD_prefixDict;
 
 typedef struct {
-    U32 hufCTable[HUF_CTABLE_SIZE_U32(255)];
+    U32 CTable[HUF_CTABLE_SIZE_U32(255)];
+    HUF_repeat repeatMode;
+} ZSTD_hufCTables_t;
+
+typedef struct {
     FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
     FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
     FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
-    HUF_repeat hufCTable_repeatMode;
     FSE_repeat offcode_repeatMode;
     FSE_repeat matchlength_repeatMode;
     FSE_repeat litlength_repeatMode;
+} ZSTD_fseCTables_t;
+
+typedef struct {
+    ZSTD_hufCTables_t huf;
+    ZSTD_fseCTables_t fse;
 } ZSTD_entropyCTables_t;
 
 typedef struct {
@@ -76,26 +92,27 @@ typedef struct {
     U32 rep[ZSTD_REP_NUM];
 } ZSTD_optimal_t;
 
+typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e;
+
 typedef struct {
     /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */
-    U32* litFreq;               /* table of literals statistics, of size 256 */
-    U32* litLengthFreq;         /* table of litLength statistics, of size (MaxLL+1) */
-    U32* matchLengthFreq;       /* table of matchLength statistics, of size (MaxML+1) */
-    U32* offCodeFreq;           /* table of offCode statistics, of size (MaxOff+1) */
-    ZSTD_match_t* matchTable;   /* list of found matches, of size ZSTD_OPT_NUM+1 */
-    ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */
+    U32* litFreq;                /* table of literals statistics, of size 256 */
+    U32* litLengthFreq;          /* table of litLength statistics, of size (MaxLL+1) */
+    U32* matchLengthFreq;        /* table of matchLength statistics, of size (MaxML+1) */
+    U32* offCodeFreq;            /* table of offCode statistics, of size (MaxOff+1) */
+    ZSTD_match_t* matchTable;    /* list of found matches, of size ZSTD_OPT_NUM+1 */
+    ZSTD_optimal_t* priceTable;  /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */
 
     U32  litSum;                 /* nb of literals */
     U32  litLengthSum;           /* nb of litLength codes */
     U32  matchLengthSum;         /* nb of matchLength codes */
     U32  offCodeSum;             /* nb of offset codes */
-    /* begin updated by ZSTD_setLog2Prices */
-    U32  log2litSum;             /* pow2 to compare log2(litfreq) to */
-    U32  log2litLengthSum;       /* pow2 to compare log2(llfreq) to */
-    U32  log2matchLengthSum;     /* pow2 to compare log2(mlfreq) to */
-    U32  log2offCodeSum;         /* pow2 to compare log2(offreq) to */
-    /* end : updated by ZSTD_setLog2Prices */
-    U32  staticPrices;           /* prices follow a pre-defined cost structure, statistics are irrelevant */
+    U32  litSumBasePrice;        /* to compare to log2(litfreq) */
+    U32  litLengthSumBasePrice;  /* to compare to log2(llfreq)  */
+    U32  matchLengthSumBasePrice;/* to compare to log2(mlfreq)  */
+    U32  offCodeSumBasePrice;    /* to compare to log2(offreq)  */
+    ZSTD_OptPrice_e priceType;   /* prices can be determined dynamically, or follow a pre-defined cost structure */
+    const ZSTD_entropyCTables_t* symbolCosts;  /* pre-calculated dictionary statistics */
 } optState_t;
 
 typedef struct {
@@ -111,17 +128,20 @@ typedef struct {
     U32 lowLimit;           /* below that point, no more data */
 } ZSTD_window_t;
 
-typedef struct {
-    ZSTD_window_t window;      /* State for window round buffer management */
-    U32 loadedDictEnd;         /* index of end of dictionary */
-    U32 nextToUpdate;          /* index from which to continue table update */
-    U32 nextToUpdate3;         /* index from which to continue table update */
-    U32 hashLog3;              /* dispatch table : larger == faster, more memory */
+typedef struct ZSTD_matchState_t ZSTD_matchState_t;
+struct ZSTD_matchState_t {
+    ZSTD_window_t window;   /* State for window round buffer management */
+    U32 loadedDictEnd;      /* index of end of dictionary */
+    U32 nextToUpdate;       /* index from which to continue table update */
+    U32 nextToUpdate3;      /* index from which to continue table update */
+    U32 hashLog3;           /* dispatch table : larger == faster, more memory */
     U32* hashTable;
     U32* hashTable3;
     U32* chainTable;
     optState_t opt;         /* optimal parser state */
-} ZSTD_matchState_t;
+    const ZSTD_matchState_t *dictMatchState;
+    ZSTD_compressionParameters cParams;
+};
 
 typedef struct {
     ZSTD_compressedBlockState_t* prevCBlock;
@@ -161,7 +181,7 @@ typedef struct {
   rawSeq* seq;     /* The start of the sequences */
   size_t pos;      /* The position where reading stopped. <= size. */
   size_t size;     /* The number of sequences. <= capacity. */
-  size_t capacity; /* The capacity of the `seq` pointer */
+  size_t capacity; /* The capacity starting from `seq` pointer */
 } rawSeqStore_t;
 
 struct ZSTD_CCtx_params_s {
@@ -170,10 +190,11 @@ struct ZSTD_CCtx_params_s {
     ZSTD_frameParameters fParams;
 
     int compressionLevel;
-    int disableLiteralCompression;
     int forceWindow;           /* force back-references to respect limit of
                                 * 1<<wLog, even for dictionary */
-#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG >= 6)
+#if defined(DEBUGLEVEL) && (DEBUGLEVEL >= 6)
     static const BYTE* g_start = NULL;
     if (g_start==NULL) g_start = (const BYTE*)literals;  /* note : index only works for compression within a single segment */
     {   U32 const pos = (U32)((const BYTE*)literals - g_start);
-        DEBUGLOG(6, "Cpos%7u :%3u literals, match%3u bytes at dist.code%7u",
+        DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u",
                pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offsetCode);
     }
 #endif
+    assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
     /* copy Literals */
-    assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + 128 KB);
+    assert(seqStorePtr->maxNbLit <= 128 KB);
+    assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit);
     ZSTD_wildcopy(seqStorePtr->lit, literals, litLength);
     seqStorePtr->lit += litLength;
 
@@ -420,6 +449,11 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
     const BYTE* const vEnd = MIN( ip + (mEnd - match), iEnd);
     size_t const matchLength = ZSTD_count(ip, match, vEnd);
     if (match + matchLength != mEnd) return matchLength;
+    DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", matchLength);
+    DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match);
+    DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip);
+    DEBUGLOG(7, "next byte : ip==%02X, istart==%02X", ip[matchLength], *iStart);
+    DEBUGLOG(7, "final match length = %zu", matchLength + ZSTD_count(ip+matchLength, iStart, iEnd));
     return matchLength + ZSTD_count(ip+matchLength, iStart, iEnd);
 }
 
@@ -496,6 +530,20 @@ MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window)
     return window.lowLimit < window.dictLimit;
 }
 
+/**
+ * ZSTD_matchState_dictMode():
+ * Inspects the provided matchState and figures out what dictMode should be
+ * passed to the compressor.
+ */
+MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms)
+{
+    return ZSTD_window_hasExtDict(ms->window) ?
+        ZSTD_extDict :
+        ms->dictMatchState != NULL ?
+            ZSTD_dictMatchState :
+            ZSTD_noDict;
+}
+
 /**
  * ZSTD_window_needOverflowCorrection():
  * Returns non-zero if the indices are getting too large and need overflow
@@ -563,31 +611,41 @@ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
  * ZSTD_window_enforceMaxDist():
  * Updates lowLimit so that:
  *    (srcEnd - base) - lowLimit == maxDist + loadedDictEnd
+ *
  * This allows a simple check that index >= lowLimit to see if index is valid.
  * This must be called before a block compression call, with srcEnd as the block
  * source end.
+ *
  * If loadedDictEndPtr is not NULL, we set it to zero once we update lowLimit.
  * This is because dictionaries are allowed to be referenced as long as the last
  * byte of the dictionary is in the window, but once they are out of range,
  * they cannot be referenced. If loadedDictEndPtr is NULL, we use
  * loadedDictEnd == 0.
+ *
+ * In normal dict mode, the dict is between lowLimit and dictLimit. In
+ * dictMatchState mode, lowLimit and dictLimit are the same, and the dictionary
+ * is below them. forceWindow and dictMatchState are therefore incompatible.
  */
 MEM_STATIC void ZSTD_window_enforceMaxDist(ZSTD_window_t* window,
                                            void const* srcEnd, U32 maxDist,
-                                           U32* loadedDictEndPtr)
+                                           U32* loadedDictEndPtr,
+                                           const ZSTD_matchState_t** dictMatchStatePtr)
 {
     U32 const current = (U32)((BYTE const*)srcEnd - window->base);
     U32 loadedDictEnd = loadedDictEndPtr != NULL ? *loadedDictEndPtr : 0;
+    DEBUGLOG(5, "ZSTD_window_enforceMaxDist: current=%u, maxDist=%u", current, maxDist);
     if (current > maxDist + loadedDictEnd) {
         U32 const newLowLimit = current - maxDist;
         if (window->lowLimit < newLowLimit) window->lowLimit = newLowLimit;
         if (window->dictLimit < window->lowLimit) {
-            DEBUGLOG(5, "Update dictLimit from %u to %u", window->dictLimit,
-                     window->lowLimit);
+            DEBUGLOG(5, "Update dictLimit to match lowLimit, from %u to %u",
+                        window->dictLimit, window->lowLimit);
             window->dictLimit = window->lowLimit;
         }
         if (loadedDictEndPtr)
             *loadedDictEndPtr = 0;
+        if (dictMatchStatePtr)
+            *dictMatchStatePtr = NULL;
     }
 }
 
@@ -603,12 +661,12 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
 {
     BYTE const* const ip = (BYTE const*)src;
     U32 contiguous = 1;
+    DEBUGLOG(5, "ZSTD_window_update");
     /* Check if blocks follow each other */
     if (src != window->nextSrc) {
         /* not contiguous */
         size_t const distanceFromBase = (size_t)(window->nextSrc - window->base);
-        DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u",
-                 window->dictLimit);
+        DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit);
         window->lowLimit = window->dictLimit;
         assert(distanceFromBase == (size_t)(U32)distanceFromBase);  /* should never overflow */
         window->dictLimit = (U32)distanceFromBase;
@@ -625,10 +683,38 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
         ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase;
         U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx;
         window->lowLimit = lowLimitMax;
+        DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit);
     }
     return contiguous;
 }
 
+
+/* debug functions */
+
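+/* ZSTD_fWeight() : cheap fixed-point approximation of log2(rawStat+1), up to a constant.
+ * ZSTD_debugTable() below prints ZSTD_fWeight(sum) - ZSTD_fWeight(count) for each cell,
+ * which is roughly log2(sum/count), i.e. an estimated symbol cost in bits (diagnostic only). */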
+MEM_STATIC double ZSTD_fWeight(U32 rawStat)
+{
+    U32 const fp_accuracy = 8;
+    U32 const fp_multiplier = (1 << fp_accuracy);
+    U32 const stat = rawStat + 1;
+    U32 const hb = ZSTD_highbit32(stat);
+    U32 const BWeight = hb * fp_multiplier;
+    U32 const FWeight = (stat << fp_accuracy) >> hb;
+    U32 const weight = BWeight + FWeight;
+    assert(hb + fp_accuracy < 31);
+    return (double)weight / fp_multiplier;
+}
+
+MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
+{
+    unsigned u, sum;
+    for (u=0, sum=0; u<=max; u++) sum += table[u];
+    DEBUGLOG(2, "total nb elts: %u", sum);
+    for (u=0; u<=max; u++) {
+        DEBUGLOG(2, "%2u: %5u  (%.2f)",
+                u, table[u], ZSTD_fWeight(sum) - ZSTD_fWeight(table[u]) );
+    }
+}
+
 #if defined (__cplusplus)
 }
 #endif
@@ -640,7 +726,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
  * ============================================================== */
 
 /* ZSTD_getCParamsFromCCtxParams() :
- * cParams are built depending on compressionLevel, src size hints, 
+ * cParams are built depending on compressionLevel, src size hints,
  * LDM and manually set compression parameters.
  */
 ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
@@ -656,6 +742,8 @@ size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
                      const ZSTD_CDict* cdict,
                      ZSTD_CCtx_params  params, unsigned long long pledgedSrcSize);
 
+void ZSTD_resetSeqStore(seqStore_t* ssPtr);
+
 /*! ZSTD_compressStream_generic() :
  *  Private use only. To be called from zstdmt_compress.c in single-thread mode. */
 size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
@@ -672,6 +760,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict);
 size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
                                     const void* dict, size_t dictSize,
                                     ZSTD_dictContentType_e dictContentType,
+                                    ZSTD_dictTableLoadMethod_e dtlm,
                                     const ZSTD_CDict* cdict,
                                     ZSTD_CCtx_params params,
                                     unsigned long long pledgedSrcSize);
diff --git a/lib/compress/zstd_double_fast.c b/lib/compress/zstd_double_fast.c
index 86e6b39621b2..7b9e18e7ed9d 100644
--- a/lib/compress/zstd_double_fast.c
+++ b/lib/compress/zstd_double_fast.c
@@ -13,9 +13,9 @@
 
 
 void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
-                              ZSTD_compressionParameters const* cParams,
-                              void const* end)
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32* const hashLarge = ms->hashTable;
     U32  const hBitsL = cParams->hashLog;
     U32  const mls = cParams->searchLength;
@@ -40,6 +40,9 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
                 hashSmall[smHash] = current + i;
             if (i == 0 || hashLarge[lgHash] == 0)
                 hashLarge[lgHash] = current + i;
+            /* Only load extra positions for ZSTD_dtlm_full */
+            if (dtlm == ZSTD_dtlm_fast)
+                break;
         }
     }
 }
@@ -48,9 +51,10 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
 FORCE_INLINE_TEMPLATE
 size_t ZSTD_compressBlock_doubleFast_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize,
-        U32 const mls /* template */)
+        void const* src, size_t srcSize,
+        U32 const mls /* template */, ZSTD_dictMode_e const dictMode)
 {
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
     U32* const hashLong = ms->hashTable;
     const U32 hBitsL = cParams->hashLog;
     U32* const hashSmall = ms->chainTable;
@@ -59,70 +63,188 @@ size_t ZSTD_compressBlock_doubleFast_generic(
     const BYTE* const istart = (const BYTE*)src;
     const BYTE* ip = istart;
     const BYTE* anchor = istart;
-    const U32 lowestIndex = ms->window.dictLimit;
-    const BYTE* const lowest = base + lowestIndex;
+    const U32 prefixLowestIndex = ms->window.dictLimit;
+    const BYTE* const prefixLowest = base + prefixLowestIndex;
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - HASH_READ_SIZE;
     U32 offset_1=rep[0], offset_2=rep[1];
     U32 offsetSaved = 0;
 
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const ZSTD_compressionParameters* const dictCParams =
+                                     dictMode == ZSTD_dictMatchState ?
+                                     &dms->cParams : NULL;
+    const U32* const dictHashLong  = dictMode == ZSTD_dictMatchState ?
+                                     dms->hashTable : NULL;
+    const U32* const dictHashSmall = dictMode == ZSTD_dictMatchState ?
+                                     dms->chainTable : NULL;
+    const U32 dictStartIndex       = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.dictLimit : 0;
+    const BYTE* const dictBase     = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.base : NULL;
+    const BYTE* const dictStart    = dictMode == ZSTD_dictMatchState ?
+                                     dictBase + dictStartIndex : NULL;
+    const BYTE* const dictEnd      = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.nextSrc : NULL;
+    const U32 dictIndexDelta       = dictMode == ZSTD_dictMatchState ?
+                                     prefixLowestIndex - (U32)(dictEnd - dictBase) :
+                                     0;
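+    /* dictIndexDelta translates an index from the dictionary matchState's
+     * index space into this window's index space :
+     * localIndex = dmsIndex + dictIndexDelta */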
+    const U32 dictHBitsL           = dictMode == ZSTD_dictMatchState ?
+                                     dictCParams->hashLog : hBitsL;
+    const U32 dictHBitsS           = dictMode == ZSTD_dictMatchState ?
+                                     dictCParams->chainLog : hBitsS;
+    const U32 dictAndPrefixLength  = (U32)(ip - prefixLowest + dictEnd - dictStart);
+
+    assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState);
+
     /* init */
-    ip += (ip==lowest);
-    {   U32 const maxRep = (U32)(ip-lowest);
+    ip += (dictAndPrefixLength == 0);
+    if (dictMode == ZSTD_noDict) {
+        U32 const maxRep = (U32)(ip - prefixLowest);
         if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
         if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
     }
+    if (dictMode == ZSTD_dictMatchState) {
+        /* dictMatchState repCode checks don't currently handle repCode == 0
+         * disabling. */
+        assert(offset_1 <= dictAndPrefixLength);
+        assert(offset_2 <= dictAndPrefixLength);
+    }
 
     /* Main Search Loop */
     while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
         size_t mLength;
+        U32 offset;
         size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
         size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
+        size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8);
+        size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls);
         U32 const current = (U32)(ip-base);
         U32 const matchIndexL = hashLong[h2];
-        U32 const matchIndexS = hashSmall[h];
+        U32 matchIndexS = hashSmall[h];
         const BYTE* matchLong = base + matchIndexL;
         const BYTE* match = base + matchIndexS;
+        const U32 repIndex = current + 1 - offset_1;
+        const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
+                            && repIndex < prefixLowestIndex) ?
+                               dictBase + (repIndex - dictIndexDelta) :
+                               base + repIndex;
         hashLong[h2] = hashSmall[h] = current;   /* update hash tables */
 
-        assert(offset_1 <= current);   /* supposed guaranteed by construction */
-        if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
-            /* favor repcode */
+        /* check dictMatchState repcode */
+        if (dictMode == ZSTD_dictMatchState
+            && ((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+            && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
+            goto _match_stored;
+        }
+
+        /* check noDict repcode */
+        if ( dictMode == ZSTD_noDict
+          && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
             mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
             ip++;
             ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
-        } else {
-            U32 offset;
-            if ( (matchIndexL > lowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip)) ) {
+            goto _match_stored;
+        }
+
+        if (matchIndexL > prefixLowestIndex) {
+            /* check prefix long match */
+            if (MEM_read64(matchLong) == MEM_read64(ip)) {
                 mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8;
                 offset = (U32)(ip-matchLong);
-                while (((ip>anchor) & (matchLong>lowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
-            } else if ( (matchIndexS > lowestIndex) && (MEM_read32(match) == MEM_read32(ip)) ) {
-                size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
-                U32 const matchIndexL3 = hashLong[hl3];
-                const BYTE* matchL3 = base + matchIndexL3;
-                hashLong[hl3] = current + 1;
-                if ( (matchIndexL3 > lowestIndex) && (MEM_read64(matchL3) == MEM_read64(ip+1)) ) {
+                while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
+                goto _match_found;
+            }
+        } else if (dictMode == ZSTD_dictMatchState) {
+            /* check dictMatchState long match */
+            U32 const dictMatchIndexL = dictHashLong[dictHL];
+            const BYTE* dictMatchL = dictBase + dictMatchIndexL;
+            assert(dictMatchL < dictEnd);
+
+            if (dictMatchL > dictStart && MEM_read64(dictMatchL) == MEM_read64(ip)) {
+                mLength = ZSTD_count_2segments(ip+8, dictMatchL+8, iend, dictEnd, prefixLowest) + 8;
+                offset = (U32)(current - dictMatchIndexL - dictIndexDelta);
+                while (((ip>anchor) & (dictMatchL>dictStart)) && (ip[-1] == dictMatchL[-1])) { ip--; dictMatchL--; mLength++; } /* catch up */
+                goto _match_found;
+            }
+        }
+
+        if (matchIndexS > prefixLowestIndex) {
+            /* check prefix short match */
+            if (MEM_read32(match) == MEM_read32(ip)) {
+                goto _search_next_long;
+            }
+        } else if (dictMode == ZSTD_dictMatchState) {
+            /* check dictMatchState short match */
+            U32 const dictMatchIndexS = dictHashSmall[dictHS];
+            match = dictBase + dictMatchIndexS;
+            matchIndexS = dictMatchIndexS + dictIndexDelta;
+
+            if (match > dictStart && MEM_read32(match) == MEM_read32(ip)) {
+                goto _search_next_long;
+            }
+        }
+
+        ip += ((ip-anchor) >> kSearchStrength) + 1;
+        continue;
+
+_search_next_long:
+
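+        /* a small (4-byte) match was found at `match` ; before committing to it,
+         * look for a longer (8-byte) match starting at ip+1,
+         * within the prefix and, in dictMatchState mode, within the dictionary */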
+        {
+            size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+            size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
+            U32 const matchIndexL3 = hashLong[hl3];
+            const BYTE* matchL3 = base + matchIndexL3;
+            hashLong[hl3] = current + 1;
+
+            /* check prefix long +1 match */
+            if (matchIndexL3 > prefixLowestIndex) {
+                if (MEM_read64(matchL3) == MEM_read64(ip+1)) {
                     mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8;
                     ip++;
                     offset = (U32)(ip-matchL3);
-                    while (((ip>anchor) & (matchL3>lowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
-                } else {
-                    mLength = ZSTD_count(ip+4, match+4, iend) + 4;
-                    offset = (U32)(ip-match);
-                    while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+                    while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
+                    goto _match_found;
+                }
+            } else if (dictMode == ZSTD_dictMatchState) {
+                /* check dict long +1 match */
+                U32 const dictMatchIndexL3 = dictHashLong[dictHLNext];
+                const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
+                assert(dictMatchL3 < dictEnd);
+                if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) {
+                    mLength = ZSTD_count_2segments(ip+1+8, dictMatchL3+8, iend, dictEnd, prefixLowest) + 8;
+                    ip++;
+                    offset = (U32)(current + 1 - dictMatchIndexL3 - dictIndexDelta);
+                    while (((ip>anchor) & (dictMatchL3>dictStart)) && (ip[-1] == dictMatchL3[-1])) { ip--; dictMatchL3--; mLength++; } /* catch up */
+                    goto _match_found;
                 }
-            } else {
-                ip += ((ip-anchor) >> kSearchStrength) + 1;
-                continue;
             }
-
-            offset_2 = offset_1;
-            offset_1 = offset;
-
-            ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
         }
 
+        /* if no long +1 match, explore the short match we found */
+        if (dictMode == ZSTD_dictMatchState && matchIndexS < prefixLowestIndex) {
+            mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4;
+            offset = (U32)(current - matchIndexS);
+            while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+        } else {
+            mLength = ZSTD_count(ip+4, match+4, iend) + 4;
+            offset = (U32)(ip - match);
+            while (((ip>anchor) & (match>prefixLowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+        }
+
+        /* fall-through */
+
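+        /* _match_found : record the new offset pair and emit the sequence ;
+         * _match_stored : advance past the match and check immediate repcodes */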
+_match_found:
+        offset_2 = offset_1;
+        offset_1 = offset;
+
+        ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+
+_match_stored:
         /* match found */
         ip += mLength;
         anchor = ip;
@@ -135,19 +257,44 @@ size_t ZSTD_compressBlock_doubleFast_generic(
                 hashSmall[ZSTD_hashPtr(ip-2, hBitsS, mls)] = (U32)(ip-2-base);
 
             /* check immediate repcode */
-            while ( (ip <= ilimit)
-                 && ( (offset_2>0)
-                 & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
-                /* store sequence */
-                size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
-                { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */
-                hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
-                hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
-                ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH);
-                ip += rLength;
-                anchor = ip;
-                continue;   /* faster when present ... (?) */
-    }   }   }
+            if (dictMode == ZSTD_dictMatchState) {
+                while (ip <= ilimit) {
+                    U32 const current2 = (U32)(ip-base);
+                    U32 const repIndex2 = current2 - offset_2;
+                    const BYTE* repMatch2 = dictMode == ZSTD_dictMatchState
+                        && repIndex2 < prefixLowestIndex ?
+                            dictBase - dictIndexDelta + repIndex2 :
+                            base + repIndex2;
+                    if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
+                       && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                        const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
+                        size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4;
+                        U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                        ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH);
+                        hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+                        hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+                        ip += repLength2;
+                        anchor = ip;
+                        continue;
+                    }
+                    break;
+                }
+            }
+
+            if (dictMode == ZSTD_noDict) {
+                while ( (ip <= ilimit)
+                     && ( (offset_2>0)
+                        & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+                    /* store sequence */
+                    size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                    U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;  /* swap offset_2 <=> offset_1 */
+                    hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
+                    hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
+                    ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH);
+                    ip += rLength;
+                    anchor = ip;
+                    continue;   /* faster when present ... (?) */
+    }   }   }   }
 
     /* save reps for next block */
     rep[0] = offset_1 ? offset_1 : offsetSaved;
@@ -160,102 +307,126 @@ size_t ZSTD_compressBlock_doubleFast_generic(
 
 size_t ZSTD_compressBlock_doubleFast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    const U32 mls = cParams->searchLength;
+    const U32 mls = ms->cParams.searchLength;
     switch(mls)
     {
     default: /* includes case 3 */
     case 4 :
-        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 4);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict);
     case 5 :
-        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 5);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict);
     case 6 :
-        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 6);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict);
     case 7 :
-        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 7);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict);
+    }
+}
+
+
+size_t ZSTD_compressBlock_doubleFast_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    const U32 mls = ms->cParams.searchLength;
+    switch(mls)
+    {
+    default: /* includes case 3 */
+    case 4 :
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState);
+    case 5 :
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_dictMatchState);
+    case 6 :
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_dictMatchState);
+    case 7 :
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState);
     }
 }
 
 
 static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize,
+        void const* src, size_t srcSize,
         U32 const mls /* template */)
 {
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
     U32* const hashLong = ms->hashTable;
     U32  const hBitsL = cParams->hashLog;
     U32* const hashSmall = ms->chainTable;
     U32  const hBitsS = cParams->chainLog;
-    const BYTE* const base = ms->window.base;
-    const BYTE* const dictBase = ms->window.dictBase;
     const BYTE* const istart = (const BYTE*)src;
     const BYTE* ip = istart;
     const BYTE* anchor = istart;
-    const U32   lowestIndex = ms->window.lowLimit;
-    const BYTE* const dictStart = dictBase + lowestIndex;
-    const U32   dictLimit = ms->window.dictLimit;
-    const BYTE* const lowPrefixPtr = base + dictLimit;
-    const BYTE* const dictEnd = dictBase + dictLimit;
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - 8;
+    const U32   prefixStartIndex = ms->window.dictLimit;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const prefixStart = base + prefixStartIndex;
+    const U32   dictStartIndex = ms->window.lowLimit;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const BYTE* const dictStart = dictBase + dictStartIndex;
+    const BYTE* const dictEnd = dictBase + prefixStartIndex;
     U32 offset_1=rep[0], offset_2=rep[1];
 
+    DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_extDict_generic (srcSize=%zu)", srcSize);
+
     /* Search Loop */
     while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
         const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls);
         const U32 matchIndex = hashSmall[hSmall];
-        const BYTE* matchBase = matchIndex < dictLimit ? dictBase : base;
+        const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
         const BYTE* match = matchBase + matchIndex;
 
         const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8);
         const U32 matchLongIndex = hashLong[hLong];
-        const BYTE* matchLongBase = matchLongIndex < dictLimit ? dictBase : base;
+        const BYTE* const matchLongBase = matchLongIndex < prefixStartIndex ? dictBase : base;
         const BYTE* matchLong = matchLongBase + matchLongIndex;
 
         const U32 current = (U32)(ip-base);
         const U32 repIndex = current + 1 - offset_1;   /* offset_1 expected <= current +1 */
-        const BYTE* repBase = repIndex < dictLimit ? dictBase : base;
-        const BYTE* repMatch = repBase + repIndex;
+        const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+        const BYTE* const repMatch = repBase + repIndex;
         size_t mLength;
         hashSmall[hSmall] = hashLong[hLong] = current;   /* update hash table */
 
-        if ( (((U32)((dictLimit-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex))
-           && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
-            const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend;
-            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, lowPrefixPtr) + 4;
+        if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */
+            & (repIndex > dictStartIndex))
+          && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
             ip++;
             ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
         } else {
-            if ((matchLongIndex > lowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
-                const BYTE* matchEnd = matchLongIndex < dictLimit ? dictEnd : iend;
-                const BYTE* lowMatchPtr = matchLongIndex < dictLimit ? dictStart : lowPrefixPtr;
+            if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
+                const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend;
+                const BYTE* const lowMatchPtr = matchLongIndex < prefixStartIndex ? dictStart : prefixStart;
                 U32 offset;
-                mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, lowPrefixPtr) + 8;
+                mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, prefixStart) + 8;
                 offset = current - matchLongIndex;
                 while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; }   /* catch up */
                 offset_2 = offset_1;
                 offset_1 = offset;
                 ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
 
-            } else if ((matchIndex > lowestIndex) && (MEM_read32(match) == MEM_read32(ip))) {
+            } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {
                 size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
                 U32 const matchIndex3 = hashLong[h3];
-                const BYTE* const match3Base = matchIndex3 < dictLimit ? dictBase : base;
+                const BYTE* const match3Base = matchIndex3 < prefixStartIndex ? dictBase : base;
                 const BYTE* match3 = match3Base + matchIndex3;
                 U32 offset;
                 hashLong[h3] = current + 1;
-                if ( (matchIndex3 > lowestIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) {
-                    const BYTE* matchEnd = matchIndex3 < dictLimit ? dictEnd : iend;
-                    const BYTE* lowMatchPtr = matchIndex3 < dictLimit ? dictStart : lowPrefixPtr;
-                    mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, lowPrefixPtr) + 8;
+                if ( (matchIndex3 > dictStartIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) {
+                    const BYTE* const matchEnd = matchIndex3 < prefixStartIndex ? dictEnd : iend;
+                    const BYTE* const lowMatchPtr = matchIndex3 < prefixStartIndex ? dictStart : prefixStart;
+                    mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, prefixStart) + 8;
                     ip++;
                     offset = current+1 - matchIndex3;
                     while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */
                 } else {
-                    const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend;
-                    const BYTE* lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr;
-                    mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, lowPrefixPtr) + 4;
+                    const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
+                    const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
+                    mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
                     offset = current - matchIndex;
                     while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */
                 }
@@ -282,12 +453,13 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
             while (ip <= ilimit) {
                 U32 const current2 = (U32)(ip-base);
                 U32 const repIndex2 = current2 - offset_2;
-                const BYTE* repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2;
-                if ( (((U32)((dictLimit-1) - repIndex2) >= 3) & (repIndex2 > lowestIndex))  /* intentional overflow */
-                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
-                    const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
-                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, lowPrefixPtr) + 4;
-                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+                if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3)   /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */
+                    & (repIndex2 > dictStartIndex))
+                  && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                    U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
                     ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH);
                     hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
                     hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
@@ -309,19 +481,19 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
 
 size_t ZSTD_compressBlock_doubleFast_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    U32 const mls = cParams->searchLength;
+    U32 const mls = ms->cParams.searchLength;
     switch(mls)
     {
     default: /* includes case 3 */
     case 4 :
-        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 4);
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4);
     case 5 :
-        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 5);
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5);
     case 6 :
-        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 6);
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6);
     case 7 :
-        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 7);
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 7);
     }
 }
diff --git a/lib/compress/zstd_double_fast.h b/lib/compress/zstd_double_fast.h
index 6d80b2774c0e..4fa31acfc0d6 100644
--- a/lib/compress/zstd_double_fast.h
+++ b/lib/compress/zstd_double_fast.h
@@ -19,14 +19,16 @@ extern "C" {
 #include "zstd_compress_internal.h"     /* ZSTD_CCtx, size_t */
 
 void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
-                              ZSTD_compressionParameters const* cParams,
-                              void const* end);
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm);
 size_t ZSTD_compressBlock_doubleFast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_doubleFast_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_doubleFast_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 
 
 #if defined (__cplusplus)
diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c
index df4d28b3402b..247746517cd5 100644
--- a/lib/compress/zstd_fast.c
+++ b/lib/compress/zstd_fast.c
@@ -13,9 +13,9 @@
 
 
 void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
-                        ZSTD_compressionParameters const* cParams,
-                        void const* end)
+                        void const* end, ZSTD_dictTableLoadMethod_e dtlm)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32* const hashTable = ms->hashTable;
     U32  const hBits = cParams->hashLog;
     U32  const mls = cParams->searchLength;
@@ -34,6 +34,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
             size_t const hash = ZSTD_hashPtr(ip + i, hBits, mls);
             if (i == 0 || hashTable[hash] == 0)
                 hashTable[hash] = current + i;
+            /* Only load extra positions for ZSTD_dtlm_full */
+            if (dtlm == ZSTD_dtlm_fast)
+                break;
         }
     }
 }
@@ -42,26 +45,65 @@ FORCE_INLINE_TEMPLATE
 size_t ZSTD_compressBlock_fast_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize,
-        U32 const hlog, U32 const stepSize, U32 const mls)
+        U32 const mls, ZSTD_dictMode_e const dictMode)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32* const hashTable = ms->hashTable;
+    U32 const hlog = cParams->hashLog;
+    /* support stepSize of 0 */
+    U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
     const BYTE* const base = ms->window.base;
     const BYTE* const istart = (const BYTE*)src;
     const BYTE* ip = istart;
     const BYTE* anchor = istart;
-    const U32   lowestIndex = ms->window.dictLimit;
-    const BYTE* const lowest = base + lowestIndex;
+    const U32   prefixStartIndex = ms->window.dictLimit;
+    const BYTE* const prefixStart = base + prefixStartIndex;
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - HASH_READ_SIZE;
     U32 offset_1=rep[0], offset_2=rep[1];
     U32 offsetSaved = 0;
 
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const ZSTD_compressionParameters* const dictCParams =
+                                     dictMode == ZSTD_dictMatchState ?
+                                     &dms->cParams : NULL;
+    const U32* const dictHashTable = dictMode == ZSTD_dictMatchState ?
+                                     dms->hashTable : NULL;
+    const U32 dictStartIndex       = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.dictLimit : 0;
+    const BYTE* const dictBase     = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.base : NULL;
+    const BYTE* const dictStart    = dictMode == ZSTD_dictMatchState ?
+                                     dictBase + dictStartIndex : NULL;
+    const BYTE* const dictEnd      = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.nextSrc : NULL;
+    const U32 dictIndexDelta       = dictMode == ZSTD_dictMatchState ?
+                                     prefixStartIndex - (U32)(dictEnd - dictBase) :
+                                     0;
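+    /* as in zstd_double_fast.c : localIndex = dmsIndex + dictIndexDelta */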
+    const U32 dictAndPrefixLength  = (U32)(ip - prefixStart + dictEnd - dictStart);
+    const U32 dictHLog             = dictMode == ZSTD_dictMatchState ?
+                                     dictCParams->hashLog : hlog;
+
+    assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState);
+
+    /* otherwise, we would get index underflow when translating a dict index
+     * into a local index */
+    assert(dictMode != ZSTD_dictMatchState
+        || prefixStartIndex >= (U32)(dictEnd - dictBase));
+
     /* init */
-    ip += (ip==lowest);
-    {   U32 const maxRep = (U32)(ip-lowest);
+    ip += (dictAndPrefixLength == 0);
+    if (dictMode == ZSTD_noDict) {
+        U32 const maxRep = (U32)(ip - prefixStart);
         if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
         if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
     }
+    if (dictMode == ZSTD_dictMatchState) {
+        /* dictMatchState repCode checks don't currently handle repCode == 0
+         * disabling. */
+        assert(offset_1 <= dictAndPrefixLength);
+        assert(offset_2 <= dictAndPrefixLength);
+    }
 
     /* Main Search Loop */
     while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
@@ -70,26 +112,67 @@ size_t ZSTD_compressBlock_fast_generic(
         U32 const current = (U32)(ip-base);
         U32 const matchIndex = hashTable[h];
         const BYTE* match = base + matchIndex;
+        const U32 repIndex = current + 1 - offset_1;
+        const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
+                            && repIndex < prefixStartIndex) ?
+                               dictBase + (repIndex - dictIndexDelta) :
+                               base + repIndex;
         hashTable[h] = current;   /* update hash table */
 
-        if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
+        if ( (dictMode == ZSTD_dictMatchState)
+          && ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
+          && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
+        } else if ( dictMode == ZSTD_noDict
+                 && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
             mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
             ip++;
             ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
-        } else {
-            if ( (matchIndex <= lowestIndex)
-              || (MEM_read32(match) != MEM_read32(ip)) ) {
+        } else if ( (matchIndex <= prefixStartIndex) ) {
+            if (dictMode == ZSTD_dictMatchState) {
+                size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls);
+                U32 const dictMatchIndex = dictHashTable[dictHash];
+                const BYTE* dictMatch = dictBase + dictMatchIndex;
+                if (dictMatchIndex <= dictStartIndex ||
+                    MEM_read32(dictMatch) != MEM_read32(ip)) {
+                    assert(stepSize >= 1);
+                    ip += ((ip-anchor) >> kSearchStrength) + stepSize;
+                    continue;
+                } else {
+                    /* found a dict match */
+                    U32 const offset = (U32)(current-dictMatchIndex-dictIndexDelta);
+                    mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4;
+                    while (((ip>anchor) & (dictMatch>dictStart))
+                         && (ip[-1] == dictMatch[-1])) {
+                        ip--; dictMatch--; mLength++;
+                    } /* catch up */
+                    offset_2 = offset_1;
+                    offset_1 = offset;
+                    ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+                }
+            } else {
                 assert(stepSize >= 1);
                 ip += ((ip-anchor) >> kSearchStrength) + stepSize;
                 continue;
             }
+        } else if (MEM_read32(match) != MEM_read32(ip)) {
+            /* it's not a match, and we're not going to check the dictionary */
+            assert(stepSize >= 1);
+            ip += ((ip-anchor) >> kSearchStrength) + stepSize;
+            continue;
+        } else {
+            /* found a regular match */
+            U32 const offset = (U32)(ip-match);
             mLength = ZSTD_count(ip+4, match+4, iend) + 4;
-            {   U32 const offset = (U32)(ip-match);
-                while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
-                offset_2 = offset_1;
-                offset_1 = offset;
-                ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
-        }   }
+            while (((ip>anchor) & (match>prefixStart))
+                 && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+            offset_2 = offset_1;
+            offset_1 = offset;
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+        }
 
         /* match found */
         ip += mLength;
@@ -97,21 +180,46 @@ size_t ZSTD_compressBlock_fast_generic(
 
         if (ip <= ilimit) {
             /* Fill Table */
+            assert(base+current+2 > istart);  /* check base overflow */
             hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2;  /* here because current+2 could be > iend-8 */
             hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
+
             /* check immediate repcode */
-            while ( (ip <= ilimit)
-                 && ( (offset_2>0)
-                 & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
-                /* store sequence */
-                size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
-                { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; }  /* swap offset_2 <=> offset_1 */
-                hashTable[ZSTD_hashPtr(ip, hlog, mls)] = (U32)(ip-base);
-                ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH);
-                ip += rLength;
-                anchor = ip;
-                continue;   /* faster when present ... (?) */
-    }   }   }
+            if (dictMode == ZSTD_dictMatchState) {
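+                /* immediate repcode : repIndex2 may fall below the prefix and land
+                 * in the dictionary, so resolve it against dictBase and compare
+                 * across both segments with ZSTD_count_2segments() */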
+                while (ip <= ilimit) {
+                    U32 const current2 = (U32)(ip-base);
+                    U32 const repIndex2 = current2 - offset_2;
+                    const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
+                            dictBase - dictIndexDelta + repIndex2 :
+                            base + repIndex2;
+                    if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
+                       && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                        const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                        size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                        U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                        ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH);
+                        hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
+                        ip += repLength2;
+                        anchor = ip;
+                        continue;
+                    }
+                    break;
+                }
+            }
+
+            if (dictMode == ZSTD_noDict) {
+                while ( (ip <= ilimit)
+                     && ( (offset_2>0)
+                        & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+                    /* store sequence */
+                    size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                    U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;  /* swap offset_2 <=> offset_1 */
+                    hashTable[ZSTD_hashPtr(ip, hlog, mls)] = (U32)(ip-base);
+                    ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH);
+                    ip += rLength;
+                    anchor = ip;
+                    continue;   /* faster when present ... (?) */
+    }   }   }   }
 
     /* save reps for next block */
     rep[0] = offset_1 ? offset_1 : offsetSaved;
@@ -124,42 +232,66 @@ size_t ZSTD_compressBlock_fast_generic(
 
 size_t ZSTD_compressBlock_fast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    U32 const hlog = cParams->hashLog;
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
     U32 const mls = cParams->searchLength;
-    U32 const stepSize = cParams->targetLength;
+    assert(ms->dictMatchState == NULL);
     switch(mls)
     {
     default: /* includes case 3 */
     case 4 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 4);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict);
     case 5 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 5);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict);
     case 6 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 6);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict);
     case 7 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 7);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict);
+    }
+}
+
+size_t ZSTD_compressBlock_fast_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
+    U32 const mls = cParams->searchLength;
+    assert(ms->dictMatchState != NULL);
+    switch(mls)
+    {
+    default: /* includes case 3 */
+    case 4 :
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState);
+    case 5 :
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_dictMatchState);
+    case 6 :
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_dictMatchState);
+    case 7 :
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState);
     }
 }
 
 
 static size_t ZSTD_compressBlock_fast_extDict_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        void const* src, size_t srcSize,
-        U32 const hlog, U32 const stepSize, U32 const mls)
+        void const* src, size_t srcSize, U32 const mls)
 {
-    U32* hashTable = ms->hashTable;
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32 const hlog = cParams->hashLog;
+    /* support stepSize of 0 */
+    U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
     const BYTE* const base = ms->window.base;
     const BYTE* const dictBase = ms->window.dictBase;
     const BYTE* const istart = (const BYTE*)src;
     const BYTE* ip = istart;
     const BYTE* anchor = istart;
-    const U32   lowestIndex = ms->window.lowLimit;
-    const BYTE* const dictStart = dictBase + lowestIndex;
-    const U32   dictLimit = ms->window.dictLimit;
-    const BYTE* const lowPrefixPtr = base + dictLimit;
-    const BYTE* const dictEnd = dictBase + dictLimit;
+    const U32   dictStartIndex = ms->window.lowLimit;
+    const BYTE* const dictStart = dictBase + dictStartIndex;
+    const U32   prefixStartIndex = ms->window.dictLimit;
+    const BYTE* const prefixStart = base + prefixStartIndex;
+    const BYTE* const dictEnd = dictBase + prefixStartIndex;
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - 8;
     U32 offset_1=rep[0], offset_2=rep[1];
@@ -167,33 +299,34 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
     /* Search Loop */
     while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
         const size_t h = ZSTD_hashPtr(ip, hlog, mls);
-        const U32 matchIndex = hashTable[h];
-        const BYTE* matchBase = matchIndex < dictLimit ? dictBase : base;
-        const BYTE* match = matchBase + matchIndex;
-        const U32 current = (U32)(ip-base);
-        const U32 repIndex = current + 1 - offset_1;   /* offset_1 expected <= current +1 */
-        const BYTE* repBase = repIndex < dictLimit ? dictBase : base;
-        const BYTE* repMatch = repBase + repIndex;
+        const U32    matchIndex = hashTable[h];
+        const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
+        const BYTE*  match = matchBase + matchIndex;
+        const U32    current = (U32)(ip-base);
+        const U32    repIndex = current + 1 - offset_1;
+        const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+        const BYTE* const repMatch = repBase + repIndex;
         size_t mLength;
         hashTable[h] = current;   /* update hash table */
+        assert(offset_1 <= current +1);   /* check repIndex */
 
-        if ( (((U32)((dictLimit-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex))
+        if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex))
            && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
-            const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend;
-            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, lowPrefixPtr) + 4;
+            const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
             ip++;
             ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
         } else {
-            if ( (matchIndex < lowestIndex) ||
+            if ( (matchIndex < dictStartIndex) ||
                  (MEM_read32(match) != MEM_read32(ip)) ) {
                 assert(stepSize >= 1);
                 ip += ((ip-anchor) >> kSearchStrength) + stepSize;
                 continue;
             }
-            {   const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend;
-                const BYTE* lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr;
+            {   const BYTE* matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
+                const BYTE* lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
                 U32 offset;
-                mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, lowPrefixPtr) + 4;
+                mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
                 while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */
                 offset = current - matchIndex;
                 offset_2 = offset_1;
@@ -213,11 +346,11 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
             while (ip <= ilimit) {
                 U32 const current2 = (U32)(ip-base);
                 U32 const repIndex2 = current2 - offset_2;
-                const BYTE* repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2;
-                if ( (((U32)((dictLimit-1) - repIndex2) >= 3) & (repIndex2 > lowestIndex))  /* intentional overflow */
+                const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+                if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex))  /* intentional overflow */
                    && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
-                    const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
-                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, lowPrefixPtr) + 4;
+                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                     U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
                     ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH);
                     hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
@@ -239,21 +372,20 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
 
 size_t ZSTD_compressBlock_fast_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    U32 const hlog = cParams->hashLog;
+    ZSTD_compressionParameters const* cParams = &ms->cParams;
     U32 const mls = cParams->searchLength;
-    U32 const stepSize = cParams->targetLength;
     switch(mls)
     {
     default: /* includes case 3 */
     case 4 :
-        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 4);
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4);
     case 5 :
-        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 5);
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5);
     case 6 :
-        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 6);
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6);
     case 7 :
-        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 7);
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7);
     }
 }
diff --git a/lib/compress/zstd_fast.h b/lib/compress/zstd_fast.h
index f0438ad5b411..b74a88c57c81 100644
--- a/lib/compress/zstd_fast.h
+++ b/lib/compress/zstd_fast.h
@@ -19,14 +19,16 @@ extern "C" {
 #include "zstd_compress_internal.h"
 
 void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
-                        ZSTD_compressionParameters const* cParams,
-                        void const* end);
+                        void const* end, ZSTD_dictTableLoadMethod_e dtlm);
 size_t ZSTD_compressBlock_fast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_fast_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_fast_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 
 #if defined (__cplusplus)
 }
diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index 9f158123f077..af615e07763d 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -16,11 +16,12 @@
 *  Binary Tree search
 ***************************************/
 
-void ZSTD_updateDUBT(
-                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+static void
+ZSTD_updateDUBT(ZSTD_matchState_t* ms,
                 const BYTE* ip, const BYTE* iend,
                 U32 mls)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32* const hashTable = ms->hashTable;
     U32  const hashLog = cParams->hashLog;
 
@@ -59,11 +60,12 @@ void ZSTD_updateDUBT(
  *  sort one already inserted but unsorted position
  *  assumption : current >= btlow == (current - btmask)
  *  doesn't fail */
-static void ZSTD_insertDUBT1(
-                 ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+static void
+ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
                  U32 current, const BYTE* inputEnd,
-                 U32 nbCompares, U32 btLow, int extDict)
+                 U32 nbCompares, U32 btLow, const ZSTD_dictMode_e dictMode)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32*   const bt = ms->chainTable;
     U32    const btLog  = cParams->chainLog - 1;
     U32    const btMask = (1 << btLog) - 1;
@@ -92,10 +94,12 @@ static void ZSTD_insertDUBT1(
         size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
         assert(matchIndex < current);
 
-        if ( (!extDict)
+        if ( (dictMode != ZSTD_extDict)
           || (matchIndex+matchLength >= dictLimit)  /* both in current segment*/
           || (current < dictLimit) /* both in extDict */) {
-            const BYTE* const mBase = !extDict || ((matchIndex+matchLength) >= dictLimit) ? base : dictBase;
+            const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
+                                     || (matchIndex+matchLength >= dictLimit)) ?
+                                        base : dictBase;
             assert( (matchIndex+matchLength >= dictLimit)   /* might be wrong if extDict is incorrectly set to 0 */
                  || (current < dictLimit) );
             match = mBase + matchIndex;
@@ -138,13 +142,92 @@ static void ZSTD_insertDUBT1(
 }
 
 
-static size_t ZSTD_DUBT_findBestMatch (
-                            ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-                            const BYTE* const ip, const BYTE* const iend,
-                            size_t* offsetPtr,
-                            U32 const mls,
-                            U32 const extDict)
+static size_t
+ZSTD_DUBT_findBetterDictMatch (
+        ZSTD_matchState_t* ms,
+        const BYTE* const ip, const BYTE* const iend,
+        size_t* offsetPtr,
+        size_t bestLength,
+        U32 nbCompares,
+        U32 const mls,
+        const ZSTD_dictMode_e dictMode)
 {
+    const ZSTD_matchState_t * const dms = ms->dictMatchState;
+    const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
+    const U32 * const dictHashTable = dms->hashTable;
+    U32         const hashLog = dmsCParams->hashLog;
+    size_t      const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32               dictMatchIndex = dictHashTable[h];
+
+    const BYTE* const base = ms->window.base;
+    const BYTE* const prefixStart = base + ms->window.dictLimit;
+    U32         const current = (U32)(ip-base);
+    const BYTE* const dictBase = dms->window.base;
+    const BYTE* const dictEnd = dms->window.nextSrc;
+    U32         const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
+    U32         const dictLowLimit = dms->window.lowLimit;
+    U32         const dictIndexDelta = ms->window.lowLimit - dictHighLimit;
+
+    U32*        const dictBt = dms->chainTable;
+    U32         const btLog  = dmsCParams->chainLog - 1;
+    U32         const btMask = (1 << btLog) - 1;
+    U32         const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;
+
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+
+    (void)dictMode;
+    assert(dictMode == ZSTD_dictMatchState);
+
+    while (nbCompares-- && (dictMatchIndex > dictLowLimit)) {
+        U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        const BYTE* match = dictBase + dictMatchIndex;
+        matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+        if (dictMatchIndex+matchLength >= dictHighLimit)
+            match = base + dictMatchIndex + dictIndexDelta;   /* to prepare for next usage of match[matchLength] */
+
+        if (matchLength > bestLength) {
+            U32 matchIndex = dictMatchIndex + dictIndexDelta;
+            if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
+                DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
+                    current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex);
+                bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+            }
+            if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
+                break;   /* drop, to guarantee consistency (miss a little bit of compression) */
+            }
+        }
+
+        if (match[matchLength] < ip[matchLength]) {
+            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            dictMatchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+        } else {
+            /* match is larger than current */
+            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
+            commonLengthLarger = matchLength;
+            dictMatchIndex = nextPtr[0];
+        }
+    }
+
+    if (bestLength >= MINMATCH) {
+        U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+        DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
+                    current, (U32)bestLength, (U32)*offsetPtr, mIndex);
+    }
+    return bestLength;
+
+}
+
+
+static size_t
+ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+                        const BYTE* const ip, const BYTE* const iend,
+                        size_t* offsetPtr,
+                        U32 const mls,
+                        const ZSTD_dictMode_e dictMode)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32*   const hashTable = ms->hashTable;
     U32    const hashLog = cParams->hashLog;
     size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
@@ -195,8 +278,8 @@ static size_t ZSTD_DUBT_findBestMatch (
     while (matchIndex) {  /* will end on matchIndex == 0 */
         U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
         U32 const nextCandidateIdx = *nextCandidateIdxPtr;
-        ZSTD_insertDUBT1(ms, cParams, matchIndex, iend,
-                         nbCandidates, unsortLimit, extDict);
+        ZSTD_insertDUBT1(ms, matchIndex, iend,
+                         nbCandidates, unsortLimit, dictMode);
         matchIndex = nextCandidateIdx;
         nbCandidates++;
     }
@@ -221,7 +304,7 @@ static size_t ZSTD_DUBT_findBestMatch (
             size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
             const BYTE* match;
 
-            if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
+            if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {
                 match = base + matchIndex;
                 matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
             } else {
@@ -237,6 +320,11 @@ static size_t ZSTD_DUBT_findBestMatch (
                 if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
                     bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
                 if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
+                    if (dictMode == ZSTD_dictMatchState) {
+                        nbCompares = 0; /* in addition to avoiding checking any
+                                         * further in this loop, make sure we
+                                         * skip checking in the dictionary. */
+                    }
                     break;   /* drop, to guarantee consistency (miss a little bit of compression) */
                 }
             }
@@ -259,6 +347,13 @@ static size_t ZSTD_DUBT_findBestMatch (
 
         *smallerPtr = *largerPtr = 0;
 
+        if (dictMode == ZSTD_dictMatchState && nbCompares) {
+            bestLength = ZSTD_DUBT_findBetterDictMatch(
+                    ms, ip, iend,
+                    offsetPtr, bestLength, nbCompares,
+                    mls, dictMode);
+        }
+
         assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */
         ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
         if (bestLength >= MINMATCH) {
@@ -272,61 +367,64 @@ static size_t ZSTD_DUBT_findBestMatch (
 
 
 /** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
-static size_t ZSTD_BtFindBestMatch (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-                        const BYTE* const ip, const BYTE* const iLimit,
-                        size_t* offsetPtr,
-                        const U32 mls /* template */)
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
+                const BYTE* const ip, const BYTE* const iLimit,
+                      size_t* offsetPtr,
+                const U32 mls /* template */,
+                const ZSTD_dictMode_e dictMode)
 {
     DEBUGLOG(7, "ZSTD_BtFindBestMatch");
     if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
-    ZSTD_updateDUBT(ms, cParams, ip, iLimit, mls);
-    return ZSTD_DUBT_findBestMatch(ms, cParams, ip, iLimit, offsetPtr, mls, 0);
+    ZSTD_updateDUBT(ms, ip, iLimit, mls);
+    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
 }
 
 
-static size_t ZSTD_BtFindBestMatch_selectMLS (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-                        const BYTE* ip, const BYTE* const iLimit,
-                        size_t* offsetPtr)
+static size_t
+ZSTD_BtFindBestMatch_selectMLS (  ZSTD_matchState_t* ms,
+                            const BYTE* ip, const BYTE* const iLimit,
+                                  size_t* offsetPtr)
 {
-    switch(cParams->searchLength)
+    switch(ms->cParams.searchLength)
     {
     default : /* includes case 3 */
-    case 4 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 4);
-    case 5 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 5);
+    case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
+    case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
     case 7 :
-    case 6 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 6);
+    case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
     }
 }
 
 
-/** Tree updater, providing best match */
-static size_t ZSTD_BtFindBestMatch_extDict (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-                        const BYTE* const ip, const BYTE* const iLimit,
-                        size_t* offsetPtr,
-                        const U32 mls)
-{
-    DEBUGLOG(7, "ZSTD_BtFindBestMatch_extDict");
-    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
-    ZSTD_updateDUBT(ms, cParams, ip, iLimit, mls);
-    return ZSTD_DUBT_findBestMatch(ms, cParams, ip, iLimit, offsetPtr, mls, 1);
-}
-
-
-static size_t ZSTD_BtFindBestMatch_selectMLS_extDict (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
+                        ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* const iLimit,
                         size_t* offsetPtr)
 {
-    switch(cParams->searchLength)
+    switch(ms->cParams.searchLength)
     {
     default : /* includes case 3 */
-    case 4 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 4);
-    case 5 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 5);
+    case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
+    case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
     case 7 :
-    case 6 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 6);
+    case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
+    }
+}
+
+
+static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    switch(ms->cParams.searchLength)
+    {
+    default : /* includes case 3 */
+    case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
+    case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
+    case 7 :
+    case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
     }
 }
 
@@ -340,7 +438,8 @@ static size_t ZSTD_BtFindBestMatch_selectMLS_extDict (
 /* Update chains up to ip (excluded)
    Assumption : always within prefix (i.e. not within extDict) */
 static U32 ZSTD_insertAndFindFirstIndex_internal(
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
+                        const ZSTD_compressionParameters* const cParams,
                         const BYTE* ip, U32 const mls)
 {
     U32* const hashTable  = ms->hashTable;
@@ -362,22 +461,21 @@ static U32 ZSTD_insertAndFindFirstIndex_internal(
     return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
 }
 
-U32 ZSTD_insertAndFindFirstIndex(
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-                        const BYTE* ip)
-{
-    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, cParams->searchLength);
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.searchLength);
 }
 
 
 /* inlining is important to hardwire a hot branch (template emulation) */
 FORCE_INLINE_TEMPLATE
 size_t ZSTD_HcFindBestMatch_generic (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
                         const BYTE* const ip, const BYTE* const iLimit,
                         size_t* offsetPtr,
-                        const U32 mls, const U32 extDict)
+                        const U32 mls, const ZSTD_dictMode_e dictMode)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32* const chainTable = ms->chainTable;
     const U32 chainSize = (1 << cParams->chainLog);
     const U32 chainMask = chainSize-1;
@@ -397,7 +495,7 @@ size_t ZSTD_HcFindBestMatch_generic (
 
     for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
         size_t currentMl=0;
-        if ((!extDict) || matchIndex >= dictLimit) {
+        if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
             const BYTE* const match = base + matchIndex;
             if (match[ml] == ip[ml])   /* potentially better */
                 currentMl = ZSTD_count(ip, match, iLimit);
@@ -419,38 +517,87 @@ size_t ZSTD_HcFindBestMatch_generic (
         matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
     }
 
+    if (dictMode == ZSTD_dictMatchState) {
+        const ZSTD_matchState_t* const dms = ms->dictMatchState;
+        const U32* const dmsChainTable = dms->chainTable;
+        const U32 dmsChainSize         = (1 << dms->cParams.chainLog);
+        const U32 dmsChainMask         = dmsChainSize - 1;
+        const U32 dmsLowestIndex       = dms->window.dictLimit;
+        const BYTE* const dmsBase      = dms->window.base;
+        const BYTE* const dmsEnd       = dms->window.nextSrc;
+        const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
+        const U32 dmsIndexDelta        = dictLimit - dmsSize;
+        const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;
+
+        matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
+
+        for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
+            size_t currentMl=0;
+            const BYTE* const match = dmsBase + matchIndex;
+            assert(match+4 <= dmsEnd);
+            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
+
+            /* save best solution */
+            if (currentMl > ml) {
+                ml = currentMl;
+                *offsetPtr = current - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+            }
+
+            if (matchIndex <= dmsMinChain) break;
+            matchIndex = dmsChainTable[matchIndex & dmsChainMask];
+        }
+    }
+
     return ml;
 }
 
 
 FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* const iLimit,
                         size_t* offsetPtr)
 {
-    switch(cParams->searchLength)
+    switch(ms->cParams.searchLength)
     {
     default : /* includes case 3 */
-    case 4 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, 0);
-    case 5 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, 0);
+    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
+    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
     case 7 :
-    case 6 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, 0);
+    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
+    }
+}
+
+
+static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    switch(ms->cParams.searchLength)
+    {
+    default : /* includes case 3 */
+    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
+    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
+    case 7 :
+    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
     }
 }
 
 
 FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* const iLimit,
-                        size_t* const offsetPtr)
+                        size_t* offsetPtr)
 {
-    switch(cParams->searchLength)
+    switch(ms->cParams.searchLength)
     {
     default : /* includes case 3 */
-    case 4 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, 1);
-    case 5 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, 1);
+    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
+    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
     case 7 :
-    case 6 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, 1);
+    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
     }
 }
 
@@ -462,30 +609,55 @@ FORCE_INLINE_TEMPLATE
 size_t ZSTD_compressBlock_lazy_generic(
                         ZSTD_matchState_t* ms, seqStore_t* seqStore,
                         U32 rep[ZSTD_REP_NUM],
-                        ZSTD_compressionParameters const* cParams,
                         const void* src, size_t srcSize,
-                        const U32 searchMethod, const U32 depth)
+                        const U32 searchMethod, const U32 depth,
+                        ZSTD_dictMode_e const dictMode)
 {
     const BYTE* const istart = (const BYTE*)src;
     const BYTE* ip = istart;
     const BYTE* anchor = istart;
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - 8;
-    const BYTE* const base = ms->window.base + ms->window.dictLimit;
+    const BYTE* const base = ms->window.base;
+    const U32 prefixLowestIndex = ms->window.dictLimit;
+    const BYTE* const prefixLowest = base + prefixLowestIndex;
 
     typedef size_t (*searchMax_f)(
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
-    searchMax_f const searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS;
+    searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ?
+        (searchMethod ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) :
+        (searchMethod ? ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS);
     U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
 
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const U32 dictLowestIndex      = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.dictLimit : 0;
+    const BYTE* const dictBase     = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.base : NULL;
+    const BYTE* const dictLowest   = dictMode == ZSTD_dictMatchState ?
+                                     dictBase + dictLowestIndex : NULL;
+    const BYTE* const dictEnd      = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.nextSrc : NULL;
+    const U32 dictIndexDelta       = dictMode == ZSTD_dictMatchState ?
+                                     prefixLowestIndex - (U32)(dictEnd - dictBase) :
+                                     0;
+    const U32 dictAndPrefixLength = (U32)(ip - prefixLowest + dictEnd - dictLowest);
+
     /* init */
-    ip += (ip==base);
+    ip += (dictAndPrefixLength == 0);
     ms->nextToUpdate3 = ms->nextToUpdate;
-    {   U32 const maxRep = (U32)(ip-base);
+    if (dictMode == ZSTD_noDict) {
+        U32 const maxRep = (U32)(ip - prefixLowest);
         if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
         if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
     }
+    if (dictMode == ZSTD_dictMatchState) {
+        /* dictMatchState repCode checks don't currently handle repCode == 0
+         * disabling. */
+        assert(offset_1 <= dictAndPrefixLength);
+        assert(offset_2 <= dictAndPrefixLength);
+    }
 
     /* Match Loop */
     while (ip < ilimit) {
@@ -494,15 +666,28 @@ size_t ZSTD_compressBlock_lazy_generic(
         const BYTE* start=ip+1;
 
         /* check repCode */
-        if ((offset_1>0) & (MEM_read32(ip+1) == MEM_read32(ip+1 - offset_1))) {
-            /* repcode : we take it */
+        if (dictMode == ZSTD_dictMatchState) {
+            const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
+            const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
+                                && repIndex < prefixLowestIndex) ?
+                                   dictBase + (repIndex - dictIndexDelta) :
+                                   base + repIndex;
+            if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+                && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+                const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+                if (depth==0) goto _storeSequence;
+            }
+        }
+        if ( dictMode == ZSTD_noDict
+          && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
             matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
             if (depth==0) goto _storeSequence;
         }
 
         /* first search (depth 0) */
-        {   size_t offsetFound = 99999999;
-            size_t const ml2 = searchMax(ms, cParams, ip, iend, &offsetFound);
+        {   size_t offsetFound = 999999999;
+            size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
             if (ml2 > matchLength)
                 matchLength = ml2, start = ip, offset=offsetFound;
         }
@@ -516,15 +701,31 @@ size_t ZSTD_compressBlock_lazy_generic(
         if (depth>=1)
         while (ip<ilimit) {
             ip ++;
-            if ((offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+            if ( (dictMode == ZSTD_noDict)
+              && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                 size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                 int const gain2 = (int)(mlRep * 3);
                 int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
                 if ((mlRep >= 4) && (gain2 > gain1))
                     matchLength = mlRep, offset = 0, start = ip;
             }
-            {   size_t offset2=99999999;
-                size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+            if (dictMode == ZSTD_dictMatchState) {
+                const U32 repIndex = (U32)(ip - base) - offset_1;
+                const BYTE* repMatch = repIndex < prefixLowestIndex ?
+                               dictBase + (repIndex - dictIndexDelta) :
+                               base + repIndex;
+                if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+                    && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+                    const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+                    size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+                    int const gain2 = (int)(mlRep * 3);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+                    if ((mlRep >= 4) && (gain2 > gain1))
+                        matchLength = mlRep, offset = 0, start = ip;
+                }
+            }
+            {   size_t offset2=999999999;
+                size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
                 if ((ml2 >= 4) && (gain2 > gain1)) {
@@ -535,15 +736,31 @@ size_t ZSTD_compressBlock_lazy_generic(
             /* let's find an even better one */
             if ((depth==2) && (ip<ilimit)) {
                 ip ++;
-                if ((offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
-                    size_t const ml2 = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
-                    int const gain2 = (int)(ml2 * 4);
+                if ( (dictMode == ZSTD_noDict)
+                  && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                    size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+                    int const gain2 = (int)(mlRep * 4);
                     int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
-                    if ((ml2 >= 4) && (gain2 > gain1))
-                        matchLength = ml2, offset = 0, start = ip;
+                    if ((mlRep >= 4) && (gain2 > gain1))
+                        matchLength = mlRep, offset = 0, start = ip;
                 }
-                {   size_t offset2=99999999;
-                    size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+                if (dictMode == ZSTD_dictMatchState) {
+                    const U32 repIndex = (U32)(ip - base) - offset_1;
+                    const BYTE* repMatch = repIndex < prefixLowestIndex ?
+                                   dictBase + (repIndex - dictIndexDelta) :
+                                   base + repIndex;
+                    if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+                        && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+                        const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+                        size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+                        int const gain2 = (int)(mlRep * 4);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+                        if ((mlRep >= 4) && (gain2 > gain1))
+                            matchLength = mlRep, offset = 0, start = ip;
+                    }
+                }
+                {   size_t offset2=999999999;
+                    size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                     int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                     int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
                     if ((ml2 >= 4) && (gain2 > gain1)) {
@@ -560,9 +777,17 @@ size_t ZSTD_compressBlock_lazy_generic(
          */
         /* catch up */
         if (offset) {
-            while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > base))
-                 && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) )  /* only search for offset within prefix */
-                { start--; matchLength++; }
+            if (dictMode == ZSTD_noDict) {
+                while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest))
+                     && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) )  /* only search for offset within prefix */
+                    { start--; matchLength++; }
+            }
+            if (dictMode == ZSTD_dictMatchState) {
+                U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
+                const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
+                const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
+                while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
+            }
             offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
         }
         /* store sequence */
@@ -573,16 +798,39 @@ _storeSequence:
         }
 
         /* check immediate repcode */
-        while ( ((ip <= ilimit) & (offset_2>0))
-             && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
-            /* store sequence */
-            matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
-            offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
-            ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
-            ip += matchLength;
-            anchor = ip;
-            continue;   /* faster when present ... (?) */
-    }   }
+        if (dictMode == ZSTD_dictMatchState) {
+            while (ip <= ilimit) {
+                U32 const current2 = (U32)(ip-base);
+                U32 const repIndex = current2 - offset_2;
+                const BYTE* repMatch = dictMode == ZSTD_dictMatchState
+                    && repIndex < prefixLowestIndex ?
+                        dictBase - dictIndexDelta + repIndex :
+                        base + repIndex;
+                if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
+                   && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
+                    matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
+                    offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
+                    ip += matchLength;
+                    anchor = ip;
+                    continue;
+                }
+                break;
+            }
+        }
+
+        if (dictMode == ZSTD_noDict) {
+            while ( ((ip <= ilimit) & (offset_2>0))
+                 && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
+                /* store sequence */
+                matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
+                ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
+                ip += matchLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+    }   }   }
 
     /* Save reps for next block */
     rep[0] = offset_1 ? offset_1 : savedOffset;
@@ -595,30 +843,58 @@ _storeSequence:
 
 size_t ZSTD_compressBlock_btlazy2(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 1, 2, ZSTD_noDict);
 }
 
 size_t ZSTD_compressBlock_lazy2(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 2, ZSTD_noDict);
 }
 
 size_t ZSTD_compressBlock_lazy(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 1, ZSTD_noDict);
 }
 
 size_t ZSTD_compressBlock_greedy(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 0, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 1, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy2_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 1, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_greedy_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 0, 0, ZSTD_dictMatchState);
 }
 
 
@@ -626,7 +902,6 @@ FORCE_INLINE_TEMPLATE
 size_t ZSTD_compressBlock_lazy_extDict_generic(
                         ZSTD_matchState_t* ms, seqStore_t* seqStore,
                         U32 rep[ZSTD_REP_NUM],
-                        ZSTD_compressionParameters const* cParams,
                         const void* src, size_t srcSize,
                         const U32 searchMethod, const U32 depth)
 {
@@ -644,9 +919,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
     const BYTE* const dictStart  = dictBase + lowestIndex;
 
     typedef size_t (*searchMax_f)(
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
-    searchMax_f searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS_extDict : ZSTD_HcFindBestMatch_extDict_selectMLS;
+    searchMax_f searchMax = searchMethod ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
 
     U32 offset_1 = rep[0], offset_2 = rep[1];
 
@@ -674,8 +949,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
         }   }
 
         /* first search (depth 0) */
-        {   size_t offsetFound = 99999999;
-            size_t const ml2 = searchMax(ms, cParams, ip, iend, &offsetFound);
+        {   size_t offsetFound = 999999999;
+            size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
             if (ml2 > matchLength)
                 matchLength = ml2, start = ip, offset=offsetFound;
         }
@@ -707,8 +982,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
             }   }
 
             /* search match, depth 1 */
-            {   size_t offset2=99999999;
-                size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+            {   size_t offset2=999999999;
+                size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
                 if ((ml2 >= 4) && (gain2 > gain1)) {
@@ -737,8 +1012,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                 }   }
 
                 /* search match, depth 2 */
-                {   size_t offset2=99999999;
-                    size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
+                {   size_t offset2=999999999;
+                    size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                     int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                     int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
                     if ((ml2 >= 4) && (gain2 > gain1)) {
@@ -794,31 +1069,31 @@ _storeSequence:
 
 size_t ZSTD_compressBlock_greedy_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 0);
 }
 
 size_t ZSTD_compressBlock_lazy_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 1);
 }
 
 size_t ZSTD_compressBlock_lazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 0, 2);
 }
 
 size_t ZSTD_compressBlock_btlazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        void const* src, size_t srcSize)
 
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, 1, 2);
 }
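
A detail worth spelling out in the dictMatchState paths above is the shared index space. The attached dictionary is assigned the indices just below the current window's `prefixLowestIndex`, with `dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase)`, so every match or repcode candidate is a single U32 index and one comparison decides which buffer it points into. A small sketch of that mapping follows; the struct and function names are hypothetical, not part of the patch.

    #include <stdint.h>

    typedef struct {
        const uint8_t* base;           /* current window: index i lives at base + i          */
        uint32_t prefixLowestIndex;    /* smallest index that belongs to the current window  */
        const uint8_t* dictBase;       /* attached dictionary buffer                         */
        uint32_t dictIndexDelta;       /* prefixLowestIndex - (dictEnd - dictBase)           */
    } shared_index_space;

    /* resolve an index from the shared space to a readable byte pointer:
     * indices in [dictIndexDelta, prefixLowestIndex) fall inside the dictionary,
     * indices >= prefixLowestIndex fall inside the current window */
    static const uint8_t* resolve_index(const shared_index_space* s, uint32_t idx)
    {
        return (idx < s->prefixLowestIndex)
             ? s->dictBase + (idx - s->dictIndexDelta)
             : s->base + idx;
    }

Matches that start in the dictionary and run into the current block are then measured with the two-segment counter (`ZSTD_count_2segments`), which is exactly what the repcode checks above do after this resolution.
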
diff --git a/lib/compress/zstd_lazy.h b/lib/compress/zstd_lazy.h
index bda064f1997b..ef85a6df9c8f 100644
--- a/lib/compress/zstd_lazy.h
+++ b/lib/compress/zstd_lazy.h
@@ -17,37 +17,48 @@ extern "C" {
 
 #include "zstd_compress_internal.h"
 
-U32 ZSTD_insertAndFindFirstIndex(
-        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-        const BYTE* ip);
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
 
 void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue);  /*! used in ZSTD_reduceIndex(). pre-emptively increase value of ZSTD_DUBT_UNSORTED_MARK */
 
 size_t ZSTD_compressBlock_btlazy2(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_lazy2(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_lazy(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_greedy(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
 
 size_t ZSTD_compressBlock_greedy_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_lazy_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_lazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_btlazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 
 #if defined (__cplusplus)
 }
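
A side effect of dropping the explicit `cParams` argument shows up in these prototypes: every block compressor and every match searcher now has the same shape, so they can sit behind plain function pointers. That is what the `searchMax_f` pointer in `ZSTD_compressBlock_lazy_generic` relies on when it picks a searcher from `(searchMethod, dictMode)`. The following reduced sketch mirrors that selection; all names are hypothetical and the searchers are empty stubs, not part of the patch.

    #include <stddef.h>

    typedef struct { int placeholder; } match_state;   /* stand-in for ZSTD_matchState_t */

    /* once the parameters travel inside the state, all searchers share one signature */
    typedef size_t (*search_max_f)(match_state* ms, const unsigned char* ip,
                                   const unsigned char* iLimit, size_t* offsetPtr);

    /* four stub specializations: binary-tree / hash-chain x noDict / dictMatchState */
    static size_t bt_noDict(match_state* ms, const unsigned char* ip,
                            const unsigned char* iLimit, size_t* offsetPtr)
    { (void)ms; (void)ip; (void)iLimit; (void)offsetPtr; return 0; }
    static size_t bt_dictMatchState(match_state* ms, const unsigned char* ip,
                                    const unsigned char* iLimit, size_t* offsetPtr)
    { (void)ms; (void)ip; (void)iLimit; (void)offsetPtr; return 0; }
    static size_t hc_noDict(match_state* ms, const unsigned char* ip,
                            const unsigned char* iLimit, size_t* offsetPtr)
    { (void)ms; (void)ip; (void)iLimit; (void)offsetPtr; return 0; }
    static size_t hc_dictMatchState(match_state* ms, const unsigned char* ip,
                                    const unsigned char* iLimit, size_t* offsetPtr)
    { (void)ms; (void)ip; (void)iLimit; (void)offsetPtr; return 0; }

    typedef enum { mode_noDict, mode_dictMatchState } dict_mode_e;

    /* mirrors the ternary that initializes searchMax in ZSTD_compressBlock_lazy_generic */
    static search_max_f select_search(unsigned useBinaryTree, dict_mode_e dictMode)
    {
        if (dictMode == mode_dictMatchState)
            return useBinaryTree ? bt_dictMatchState : hc_dictMatchState;
        return useBinaryTree ? bt_noDict : hc_noDict;
    }
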
diff --git a/lib/compress/zstd_ldm.c b/lib/compress/zstd_ldm.c
index bffd8a3dfaa5..6238ddecf24f 100644
--- a/lib/compress/zstd_ldm.c
+++ b/lib/compress/zstd_ldm.c
@@ -9,6 +9,7 @@
 
 #include "zstd_ldm.h"
 
+#include "debug.h"
 #include "zstd_fast.h"          /* ZSTD_fillHashTable() */
 #include "zstd_double_fast.h"   /* ZSTD_fillDoubleHashTable() */
 
@@ -20,7 +21,7 @@
 void ZSTD_ldm_adjustParameters(ldmParams_t* params,
                                ZSTD_compressionParameters const* cParams)
 {
-    U32 const windowLog = cParams->windowLog;
+    params->windowLog = cParams->windowLog;
     ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX);
     DEBUGLOG(4, "ZSTD_ldm_adjustParameters");
     if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG;
@@ -33,12 +34,13 @@ void ZSTD_ldm_adjustParameters(ldmParams_t* params,
       params->minMatchLength = minMatch;
     }
     if (params->hashLog == 0) {
-        params->hashLog = MAX(ZSTD_HASHLOG_MIN, windowLog - LDM_HASH_RLOG);
+        params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG);
         assert(params->hashLog <= ZSTD_HASHLOG_MAX);
     }
     if (params->hashEveryLog == 0) {
-        params->hashEveryLog =
-                windowLog < params->hashLog ? 0 : windowLog - params->hashLog;
+        params->hashEveryLog = params->windowLog < params->hashLog
+                                   ? 0
+                                   : params->windowLog - params->hashLog;
     }
     params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog);
 }
@@ -216,21 +218,18 @@ static size_t ZSTD_ldm_countBackwardsMatch(
  *  The tables for the other strategies are filled within their
  *  block compressors. */
 static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
-                                      ZSTD_compressionParameters const* cParams,
                                       void const* end)
 {
     const BYTE* const iend = (const BYTE*)end;
 
-    switch(cParams->strategy)
+    switch(ms->cParams.strategy)
     {
     case ZSTD_fast:
-        ZSTD_fillHashTable(ms, cParams, iend);
-        ms->nextToUpdate = (U32)(iend - ms->window.base);
+        ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast);
         break;
 
     case ZSTD_dfast:
-        ZSTD_fillDoubleHashTable(ms, cParams, iend);
-        ms->nextToUpdate = (U32)(iend - ms->window.base);
+        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast);
         break;
 
     case ZSTD_greedy:
@@ -508,7 +507,7 @@ size_t ZSTD_ldm_generateSequences(
          *       * Try invalidation after the sequence generation and test
          *         the offset against maxDist directly.
          */
-        ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, NULL);
+        ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, NULL, NULL);
         /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */
         newLeftoverSize = ZSTD_ldm_generateSequences_internal(
             ldmState, sequences, params, chunkStart, chunkSize);
@@ -591,19 +590,19 @@ static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore,
 
 size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
     ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-    ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize,
-    int const extDict)
+    void const* src, size_t srcSize)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     unsigned const minMatch = cParams->searchLength;
     ZSTD_blockCompressor const blockCompressor =
-        ZSTD_selectBlockCompressor(cParams->strategy, extDict);
-    BYTE const* const base = ms->window.base;
+        ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dictMode(ms));
     /* Input bounds */
     BYTE const* const istart = (BYTE const*)src;
     BYTE const* const iend = istart + srcSize;
     /* Input positions */
     BYTE const* ip = istart;
 
+    DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize);
     assert(rawSeqStore->pos <= rawSeqStore->size);
     assert(rawSeqStore->size <= rawSeqStore->capacity);
     /* Loop through each sequence and apply the block compressor to the lits */
@@ -621,14 +620,13 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
 
         /* Fill tables for block compressor */
         ZSTD_ldm_limitTableUpdate(ms, ip);
-        ZSTD_ldm_fillFastTables(ms, cParams, ip);
+        ZSTD_ldm_fillFastTables(ms, ip);
         /* Run the block compressor */
+        DEBUGLOG(5, "calling block compressor on segment of size %u", sequence.litLength);
         {
             size_t const newLitLength =
-                blockCompressor(ms, seqStore, rep, cParams, ip,
-                                sequence.litLength);
+                blockCompressor(ms, seqStore, rep, ip, sequence.litLength);
             ip += sequence.litLength;
-            ms->nextToUpdate = (U32)(ip - base);
             /* Update the repcodes */
             for (i = ZSTD_REP_NUM - 1; i > 0; i--)
                 rep[i] = rep[i-1];
@@ -642,12 +640,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
     }
     /* Fill the tables for the block compressor */
     ZSTD_ldm_limitTableUpdate(ms, ip);
-    ZSTD_ldm_fillFastTables(ms, cParams, ip);
+    ZSTD_ldm_fillFastTables(ms, ip);
     /* Compress the last literals */
-    {
-        size_t const lastLiterals = blockCompressor(ms, seqStore, rep, cParams,
-                                                    ip, iend - ip);
-        ms->nextToUpdate = (U32)(iend - base);
-        return lastLiterals;
-    }
+    return blockCompressor(ms, seqStore, rep, ip, iend - ip);
 }
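
`ZSTD_ldm_blockCompress` above now reads its parameters from `ms->cParams` and selects the inner block compressor from `(strategy, ZSTD_matchState_dictMode(ms))`; the long-distance sequences themselves were produced earlier by `ZSTD_ldm_generateSequences`, so this function only interleaves them with regular compression of the literal gaps. A schematic of that interleaving follows; the types and names are hypothetical, not part of the patch.

    #include <stddef.h>
    #include <stdint.h>

    typedef struct { uint32_t litLength, matchLength, offset; } raw_seq;   /* like rawSeq */
    typedef size_t (*literal_compressor)(const uint8_t* src, size_t srcSize);

    /* schematic of the loop in ZSTD_ldm_blockCompress: pre-computed long-distance
     * matches are emitted as-is, and the regular block compressor only ever sees
     * the literal gaps between them */
    static size_t ldm_block_compress_sketch(const raw_seq* seqs, size_t nbSeqs,
                                            const uint8_t* src, size_t srcSize,
                                            literal_compressor compressLiterals)
    {
        const uint8_t* ip = src;
        const uint8_t* const iend = src + srcSize;
        size_t i;
        for (i = 0; i < nbSeqs; i++) {
            /* 1. hand the literals preceding this long-distance match to the block compressor */
            compressLiterals(ip, seqs[i].litLength);
            ip += seqs[i].litLength;
            /* 2. record the long-distance match itself (seqs[i].offset / matchLength), then skip it */
            ip += seqs[i].matchLength;
        }
        /* 3. whatever remains after the last sequence is compressed as plain literals */
        return compressLiterals(ip, (size_t)(iend - ip));
    }

In the real code the repcode history is also rotated after each sequence, and the fast-strategy tables are refreshed via `ZSTD_ldm_fillFastTables` before each call, as the hunks above show.
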
diff --git a/lib/compress/zstd_ldm.h b/lib/compress/zstd_ldm.h
index 0c3789ff137c..21fba4d591a4 100644
--- a/lib/compress/zstd_ldm.h
+++ b/lib/compress/zstd_ldm.h
@@ -61,9 +61,7 @@ size_t ZSTD_ldm_generateSequences(
  */
 size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-            ZSTD_compressionParameters const* cParams,
-            void const* src, size_t srcSize,
-            int const extDict);
+            void const* src, size_t srcSize);
 
 /**
  * ZSTD_ldm_skipSequences():
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index f63f0c585224..8af69a91d46e 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -9,10 +9,11 @@
  */
 
 #include "zstd_compress_internal.h"
+#include "hist.h"
 #include "zstd_opt.h"
 
 
-#define ZSTD_LITFREQ_ADD    2   /* scaling factor for litFreq, so that frequencies adapt faster to new stats. Also used for matchSum (?) */
+#define ZSTD_LITFREQ_ADD    2   /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
 #define ZSTD_FREQ_DIV       4   /* log factor when using previous stats to init next stats */
 #define ZSTD_MAX_PRICE     (1<<30)
 
@@ -20,128 +21,210 @@
 /*-*************************************
 *  Price functions for optimal parser
 ***************************************/
-static void ZSTD_setLog2Prices(optState_t* optPtr)
+
+#if 0    /* approximation at bit level */
+#  define BITCOST_ACCURACY 0
+#  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+#  define WEIGHT(stat)  ((void)opt, ZSTD_bitWeight(stat))
+#elif 0  /* fractional bit accuracy */
+#  define BITCOST_ACCURACY 8
+#  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+#  define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat))
+#else    /* opt==approx, ultra==accurate */
+#  define BITCOST_ACCURACY 8
+#  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+#  define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
+#endif
+
+MEM_STATIC U32 ZSTD_bitWeight(U32 stat)
 {
-    optPtr->log2litSum = ZSTD_highbit32(optPtr->litSum+1);
-    optPtr->log2litLengthSum = ZSTD_highbit32(optPtr->litLengthSum+1);
-    optPtr->log2matchLengthSum = ZSTD_highbit32(optPtr->matchLengthSum+1);
-    optPtr->log2offCodeSum = ZSTD_highbit32(optPtr->offCodeSum+1);
+    return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER);
 }
 
+MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
+{
+    U32 const stat = rawStat + 1;
+    U32 const hb = ZSTD_highbit32(stat);
+    U32 const BWeight = hb * BITCOST_MULTIPLIER;
+    U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
+    U32 const weight = BWeight + FWeight;
+    assert(hb + BITCOST_ACCURACY < 31);
+    return weight;
+}
+
+/* debugging function, @return price in bytes */
+MEM_STATIC double ZSTD_fCost(U32 price)
+{
+    return (double)price / (BITCOST_MULTIPLIER*8);
+}
+
+static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel)
+{
+    optPtr->litSumBasePrice = WEIGHT(optPtr->litSum, optLevel);
+    optPtr->litLengthSumBasePrice = WEIGHT(optPtr->litLengthSum, optLevel);
+    optPtr->matchLengthSumBasePrice = WEIGHT(optPtr->matchLengthSum, optLevel);
+    optPtr->offCodeSumBasePrice = WEIGHT(optPtr->offCodeSum, optLevel);
+}
+
+
+static U32 ZSTD_downscaleStat(U32* table, U32 lastEltIndex, int malus)
+{
+    U32 s, sum=0;
+    assert(ZSTD_FREQ_DIV+malus > 0 && ZSTD_FREQ_DIV+malus < 31);
+    for (s=0; s<=lastEltIndex; s++) {
+        table[s] = 1 + (table[s] >> (ZSTD_FREQ_DIV+malus));
+        sum += table[s];
+    }
+    return sum;
+}
 
 static void ZSTD_rescaleFreqs(optState_t* const optPtr,
-                              const BYTE* const src, size_t const srcSize)
+                              const BYTE* const src, size_t const srcSize,
+                              int optLevel)
 {
-    optPtr->staticPrices = 0;
+    optPtr->priceType = zop_dynamic;
 
-    if (optPtr->litLengthSum == 0) {  /* first init */
-        unsigned u;
-        if (srcSize <= 1024) optPtr->staticPrices = 1;
+    if (optPtr->litLengthSum == 0) {  /* first block : init */
+        if (srcSize <= 1024)   /* heuristic */
+            optPtr->priceType = zop_predef;
+
+        assert(optPtr->symbolCosts != NULL);
+        if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { /* huffman table presumed generated by dictionary */
+            optPtr->priceType = zop_dynamic;
+
+            assert(optPtr->litFreq != NULL);
+            optPtr->litSum = 0;
+            {   unsigned lit;
+                for (lit=0; lit<=MaxLit; lit++) {
+                    U32 const scaleLog = 11;   /* scale to 2K */
+                    U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->huf.CTable, lit);
+                    assert(bitCost <= scaleLog);
+                    optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->litSum += optPtr->litFreq[lit];
+            }   }
+
+            {   unsigned ll;
+                FSE_CState_t llstate;
+                FSE_initCState(&llstate, optPtr->symbolCosts->fse.litlengthCTable);
+                optPtr->litLengthSum = 0;
+                for (ll=0; ll<=MaxLL; ll++) {
+                    U32 const scaleLog = 10;   /* scale to 1K */
+                    U32 const bitCost = FSE_getMaxNbBits(llstate.symbolTT, ll);
+                    assert(bitCost < scaleLog);
+                    optPtr->litLengthFreq[ll] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->litLengthSum += optPtr->litLengthFreq[ll];
+            }   }
+
+            {   unsigned ml;
+                FSE_CState_t mlstate;
+                FSE_initCState(&mlstate, optPtr->symbolCosts->fse.matchlengthCTable);
+                optPtr->matchLengthSum = 0;
+                for (ml=0; ml<=MaxML; ml++) {
+                    U32 const scaleLog = 10;
+                    U32 const bitCost = FSE_getMaxNbBits(mlstate.symbolTT, ml);
+                    assert(bitCost < scaleLog);
+                    optPtr->matchLengthFreq[ml] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->matchLengthSum += optPtr->matchLengthFreq[ml];
+            }   }
+
+            {   unsigned of;
+                FSE_CState_t ofstate;
+                FSE_initCState(&ofstate, optPtr->symbolCosts->fse.offcodeCTable);
+                optPtr->offCodeSum = 0;
+                for (of=0; of<=MaxOff; of++) {
+                    U32 const scaleLog = 10;
+                    U32 const bitCost = FSE_getMaxNbBits(ofstate.symbolTT, of);
+                    assert(bitCost < scaleLog);
+                    optPtr->offCodeFreq[of] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+                    optPtr->offCodeSum += optPtr->offCodeFreq[of];
+            }   }
+
+        } else {  /* not a dictionary */
+
+            assert(optPtr->litFreq != NULL);
+            {   unsigned lit = MaxLit;
+                HIST_count_simple(optPtr->litFreq, &lit, src, srcSize);   /* use raw first block to init statistics */
+            }
+            optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1);
+
+            {   unsigned ll;
+                for (ll=0; ll<=MaxLL; ll++)
+                    optPtr->litLengthFreq[ll] = 1;
+            }
+            optPtr->litLengthSum = MaxLL+1;
+
+            {   unsigned ml;
+                for (ml=0; ml<=MaxML; ml++)
+                    optPtr->matchLengthFreq[ml] = 1;
+            }
+            optPtr->matchLengthSum = MaxML+1;
+
+            {   unsigned of;
+                for (of=0; of<=MaxOff; of++)
+                    optPtr->offCodeFreq[of] = 1;
+            }
+            optPtr->offCodeSum = MaxOff+1;
 
-        assert(optPtr->litFreq!=NULL);
-        for (u=0; u<=MaxLit; u++)
-            optPtr->litFreq[u] = 0;
-        for (u=0; u<srcSize; u++)
-            optPtr->litFreq[src[u]]++;
-        optPtr->litSum = 0;
-        for (u=0; u<=MaxLit; u++) {
-            optPtr->litFreq[u] = 1 + (optPtr->litFreq[u] >> ZSTD_FREQ_DIV);
-            optPtr->litSum += optPtr->litFreq[u];
         }
 
-        for (u=0; u<=MaxLL; u++)
-            optPtr->litLengthFreq[u] = 1;
-        optPtr->litLengthSum = MaxLL+1;
-        for (u=0; u<=MaxML; u++)
-            optPtr->matchLengthFreq[u] = 1;
-        optPtr->matchLengthSum = MaxML+1;
-        for (u=0; u<=MaxOff; u++)
-            optPtr->offCodeFreq[u] = 1;
-        optPtr->offCodeSum = (MaxOff+1);
+    } else {   /* new block : re-use previous statistics, scaled down */
 
-    } else {
-        unsigned u;
-
-        optPtr->litSum = 0;
-        for (u=0; u<=MaxLit; u++) {
-            optPtr->litFreq[u] = 1 + (optPtr->litFreq[u] >> (ZSTD_FREQ_DIV+1));
-            optPtr->litSum += optPtr->litFreq[u];
-        }
-        optPtr->litLengthSum = 0;
-        for (u=0; u<=MaxLL; u++) {
-            optPtr->litLengthFreq[u] = 1 + (optPtr->litLengthFreq[u]>>(ZSTD_FREQ_DIV+1));
-            optPtr->litLengthSum += optPtr->litLengthFreq[u];
-        }
-        optPtr->matchLengthSum = 0;
-        for (u=0; u<=MaxML; u++) {
-            optPtr->matchLengthFreq[u] = 1 + (optPtr->matchLengthFreq[u]>>ZSTD_FREQ_DIV);
-            optPtr->matchLengthSum += optPtr->matchLengthFreq[u];
-        }
-        optPtr->offCodeSum = 0;
-        for (u=0; u<=MaxOff; u++) {
-            optPtr->offCodeFreq[u] = 1 + (optPtr->offCodeFreq[u]>>ZSTD_FREQ_DIV);
-            optPtr->offCodeSum += optPtr->offCodeFreq[u];
-        }
+        optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1);
+        optPtr->litLengthSum = ZSTD_downscaleStat(optPtr->litLengthFreq, MaxLL, 0);
+        optPtr->matchLengthSum = ZSTD_downscaleStat(optPtr->matchLengthFreq, MaxML, 0);
+        optPtr->offCodeSum = ZSTD_downscaleStat(optPtr->offCodeFreq, MaxOff, 0);
     }
 
-    ZSTD_setLog2Prices(optPtr);
+    ZSTD_setBasePrices(optPtr, optLevel);
 }
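
When a dictionary provides valid entropy tables, the loops above convert per-symbol code lengths back into pseudo-frequencies with freq = 1 << (scaleLog - bitCost): a symbol the dictionary codes in b bits receives roughly a 2^-b share of a ~2^scaleLog total, so the optimal parser prices it at about b bits. A minimal standalone sketch of that conversion, not part of the patch (the 4-symbol bitCosts array is invented for illustration):

#include <stdio.h>

int main(void)
{
    unsigned const scaleLog = 11;                 /* literals are scaled to a ~2K total */
    unsigned const bitCosts[4] = { 3, 5, 8, 0 };  /* hypothetical code lengths; 0 = absent symbol */
    unsigned freq[4], sum = 0, s;

    for (s = 0; s < 4; s++) {
        /* same rule as above : absent symbols still get a minimum frequency of 1,
         * so a (large) price can be computed for them if they ever appear */
        freq[s] = bitCosts[s] ? 1u << (scaleLog - bitCosts[s]) : 1;
        sum += freq[s];
    }
    for (s = 0; s < 4; s++)
        printf("symbol %u : %u bits -> pseudo-frequency %u\n", s, bitCosts[s], freq[s]);
    printf("partial sum = %u (a full alphabet would come close to %u)\n", sum, 1u << scaleLog);
    return 0;
}
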
 
-
 /* ZSTD_rawLiteralsCost() :
- * cost of literals (only) in given segment (which length can be null)
- * does not include cost of literalLength symbol */
+ * price of literals (only) in the specified segment (whose length can be 0).
+ * does not include price of literalLength symbol */
 static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
-                                const optState_t* const optPtr)
+                                const optState_t* const optPtr,
+                                int optLevel)
 {
-    if (optPtr->staticPrices) return (litLength*6);  /* 6 bit per literal - no statistic used */
     if (litLength == 0) return 0;
+    if (optPtr->priceType == zop_predef)
+        return (litLength*6) * BITCOST_MULTIPLIER;  /* 6 bit per literal - no statistic used */
 
-    /* literals */
-    {   U32 u;
-        U32 cost = litLength * optPtr->log2litSum;
-        for (u=0; u < litLength; u++)
-            cost -= ZSTD_highbit32(optPtr->litFreq[literals[u]]+1);
-        return cost;
-    }
-}
-
-/* ZSTD_litLengthPrice() :
- * cost of literalLength symbol */
-static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr)
-{
-    if (optPtr->staticPrices) return ZSTD_highbit32((U32)litLength+1);
-
-    /* literal Length */
-    {   U32 const llCode = ZSTD_LLcode(litLength);
-        U32 const price = LL_bits[llCode] + optPtr->log2litLengthSum - ZSTD_highbit32(optPtr->litLengthFreq[llCode]+1);
+    /* dynamic statistics */
+    {   U32 price = litLength * optPtr->litSumBasePrice;
+        U32 u;
+        for (u=0; u < litLength; u++) {
+            assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice);   /* literal cost should never be negative */
+            price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel);
+        }
         return price;
     }
 }
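
In other words, a run of literals is priced at litLength * WEIGHT(litSum) minus the sum of WEIGHT(litFreq[c]) over its bytes, i.e. roughly the sum of log2(litSum/litFreq[c]) in 1/256th-of-a-bit units; the assert above guarantees each per-byte term stays non-negative. As a hypothetical example, with litSum near 4096, a byte whose litFreq is 16 costs about (12 - 4) * 256 = 2048 units, i.e. ~8 bits.
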
 
 /* ZSTD_litLengthPrice() :
- * cost of the literal part of a sequence,
- * including literals themselves, and literalLength symbol */
-static U32 ZSTD_fullLiteralsCost(const BYTE* const literals, U32 const litLength,
-                                 const optState_t* const optPtr)
+ * cost of literalLength symbol */
+static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel)
 {
-    return ZSTD_rawLiteralsCost(literals, litLength, optPtr)
-         + ZSTD_litLengthPrice(litLength, optPtr);
+    if (optPtr->priceType == zop_predef) return WEIGHT(litLength, optLevel);
+
+    /* dynamic statistics */
+    {   U32 const llCode = ZSTD_LLcode(litLength);
+        return (LL_bits[llCode] * BITCOST_MULTIPLIER) + (optPtr->litLengthSumBasePrice - WEIGHT(optPtr->litLengthFreq[llCode], optLevel));
+    }
 }
 
 /* ZSTD_litLengthContribution() :
  * @return ( cost(litlength) - cost(0) )
  * this value can then be added to rawLiteralsCost()
  * to provide a cost which is directly comparable to a match ending at same position */
-static int ZSTD_litLengthContribution(U32 const litLength, const optState_t* const optPtr)
+static int ZSTD_litLengthContribution(U32 const litLength, const optState_t* const optPtr, int optLevel)
 {
-    if (optPtr->staticPrices) return ZSTD_highbit32(litLength+1);
+    if (optPtr->priceType >= zop_predef) return WEIGHT(litLength, optLevel);
 
-    /* literal Length */
+    /* dynamic statistics */
     {   U32 const llCode = ZSTD_LLcode(litLength);
-        int const contribution = LL_bits[llCode]
-                        + ZSTD_highbit32(optPtr->litLengthFreq[0]+1)
-                        - ZSTD_highbit32(optPtr->litLengthFreq[llCode]+1);
+        int const contribution = (LL_bits[llCode] * BITCOST_MULTIPLIER)
+                               + WEIGHT(optPtr->litLengthFreq[0], optLevel)   /* note: log2litLengthSum cancel out */
+                               - WEIGHT(optPtr->litLengthFreq[llCode], optLevel);
 #if 1
         return contribution;
 #else
@@ -155,10 +238,11 @@ static int ZSTD_litLengthContribution(U32 const litLength, const optState_t* con
  * which can be compared to the ending cost of a match
  * should a new match start at this position */
 static int ZSTD_literalsContribution(const BYTE* const literals, U32 const litLength,
-                                     const optState_t* const optPtr)
+                                     const optState_t* const optPtr,
+                                     int optLevel)
 {
-    int const contribution = ZSTD_rawLiteralsCost(literals, litLength, optPtr)
-                           + ZSTD_litLengthContribution(litLength, optPtr);
+    int const contribution = ZSTD_rawLiteralsCost(literals, litLength, optPtr, optLevel)
+                           + ZSTD_litLengthContribution(litLength, optPtr, optLevel);
     return contribution;
 }
 
@@ -166,31 +250,38 @@ static int ZSTD_literalsContribution(const BYTE* const literals, U32 const litLe
  * Provides the cost of the match part (offset + matchLength) of a sequence
  * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence.
  * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */
-FORCE_INLINE_TEMPLATE U32 ZSTD_getMatchPrice(
-                                    U32 const offset, U32 const matchLength,
-                                    const optState_t* const optPtr,
-                                    int const optLevel)
+FORCE_INLINE_TEMPLATE U32
+ZSTD_getMatchPrice(U32 const offset,
+                   U32 const matchLength,
+                   const optState_t* const optPtr,
+                   int const optLevel)
 {
     U32 price;
     U32 const offCode = ZSTD_highbit32(offset+1);
     U32 const mlBase = matchLength - MINMATCH;
     assert(matchLength >= MINMATCH);
 
-    if (optPtr->staticPrices)  /* fixed scheme, do not use statistics */
-        return ZSTD_highbit32((U32)mlBase+1) + 16 + offCode;
+    if (optPtr->priceType == zop_predef)  /* fixed scheme, do not use statistics */
+        return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER);
 
-    price = offCode + optPtr->log2offCodeSum - ZSTD_highbit32(optPtr->offCodeFreq[offCode]+1);
-    if ((optLevel<2) /*static*/ && offCode >= 20) price += (offCode-19)*2; /* handicap for long distance offsets, favor decompression speed */
+    /* dynamic statistics */
+    price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel));
+    if ((optLevel<2) /*static*/ && offCode >= 20)
+        price += (offCode-19)*2 * BITCOST_MULTIPLIER; /* handicap for long distance offsets, favor decompression speed */
 
     /* match Length */
     {   U32 const mlCode = ZSTD_MLcode(mlBase);
-        price += ML_bits[mlCode] + optPtr->log2matchLengthSum - ZSTD_highbit32(optPtr->matchLengthFreq[mlCode]+1);
+        price += (ML_bits[mlCode] * BITCOST_MULTIPLIER) + (optPtr->matchLengthSumBasePrice - WEIGHT(optPtr->matchLengthFreq[mlCode], optLevel));
     }
 
+    price += BITCOST_MULTIPLIER / 5;   /* heuristic : make matches a bit more costly to favor fewer sequences -> faster decompression speed */
+
     DEBUGLOG(8, "ZSTD_getMatchPrice(ml:%u) = %u", matchLength, price);
     return price;
 }
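
Two details are worth noting in the price above: the flat BITCOST_MULTIPLIER/5 term adds roughly 0.2 bit per match, nudging the parser toward fewer sequences, and the long-distance handicap applies only at optLevel < 2 (btopt). As a hypothetical example, an offset near 2^21 gives offCode = 21, so btopt charges an extra (21 - 19) * 2 = 4 bits on top of the statistical price, steering it toward offsets that are cheaper to decompress; btultra (optLevel 2) omits the handicap.
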
 
+/* ZSTD_updateStats() :
+ * assumption : literals + litLength <= iend */
 static void ZSTD_updateStats(optState_t* const optPtr,
                              U32 litLength, const BYTE* literals,
                              U32 offsetCode, U32 matchLength)
@@ -269,10 +360,11 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, const BYTE*
  *  ip : assumed <= iend-8 .
  * @return : nb of positions added */
 static U32 ZSTD_insertBt1(
-                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                ZSTD_matchState_t* ms,
                 const BYTE* const ip, const BYTE* const iend,
-                U32 const mls, U32 const extDict)
+                U32 const mls, const int extDict)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32*   const hashTable = ms->hashTable;
     U32    const hashLog = cParams->hashLog;
     size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
@@ -293,6 +385,7 @@ static U32 ZSTD_insertBt1(
     U32* largerPtr  = smallerPtr + 1;
     U32 dummy32;   /* to be nullified at the end */
     U32 const windowLow = ms->window.lowLimit;
+    U32 const matchLow = windowLow ? windowLow : 1;
     U32 matchEndIdx = current+8+1;
     size_t bestLength = 8;
     U32 nbCompares = 1U << cParams->searchLog;
@@ -308,7 +401,7 @@ static U32 ZSTD_insertBt1(
     assert(ip <= iend-8);   /* required for h calculation */
     hashTable[h] = current;   /* Update Hash Table */
 
-    while (nbCompares-- && (matchIndex > windowLow)) {
+    while (nbCompares-- && (matchIndex >= matchLow)) {
         U32* const nextPtr = bt + 2*(matchIndex & btMask);
         size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
         assert(matchIndex < current);
@@ -334,8 +427,8 @@ static U32 ZSTD_insertBt1(
         }
 #endif
 
-        if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
-            assert(matchIndex+matchLength >= dictLimit);   /* might be wrong if extDict is incorrectly set to 0 */
+        if (!extDict || (matchIndex+matchLength >= dictLimit)) {
+            assert(matchIndex+matchLength >= dictLimit);   /* might be wrong if actually extDict */
             match = base + matchIndex;
             matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
         } else {
@@ -379,35 +472,33 @@ static U32 ZSTD_insertBt1(
 
 FORCE_INLINE_TEMPLATE
 void ZSTD_updateTree_internal(
-                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                ZSTD_matchState_t* ms,
                 const BYTE* const ip, const BYTE* const iend,
-                const U32 mls, const U32 extDict)
+                const U32 mls, const ZSTD_dictMode_e dictMode)
 {
     const BYTE* const base = ms->window.base;
     U32 const target = (U32)(ip - base);
     U32 idx = ms->nextToUpdate;
-    DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u  (extDict:%u)",
-                idx, target, extDict);
+    DEBUGLOG(5, "ZSTD_updateTree_internal, from %u to %u  (dictMode:%u)",
+                idx, target, dictMode);
 
     while(idx < target)
-        idx += ZSTD_insertBt1(ms, cParams, base+idx, iend, mls, extDict);
+        idx += ZSTD_insertBt1(ms, base+idx, iend, mls, dictMode == ZSTD_extDict);
     ms->nextToUpdate = target;
 }
 
-void ZSTD_updateTree(
-                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-                const BYTE* ip, const BYTE* iend)
-{
-    ZSTD_updateTree_internal(ms, cParams, ip, iend, cParams->searchLength, 0 /*extDict*/);
+void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {
+    ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.searchLength, ZSTD_noDict);
 }
 
 FORCE_INLINE_TEMPLATE
 U32 ZSTD_insertBtAndGetAllMatches (
-                    ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-                    const BYTE* const ip, const BYTE* const iLimit, int const extDict,
+                    ZSTD_matchState_t* ms,
+                    const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode,
                     U32 rep[ZSTD_REP_NUM], U32 const ll0,
                     ZSTD_match_t* matches, const U32 lengthToBeat, U32 const mls /* template */)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
     const BYTE* const base = ms->window.base;
     U32 const current = (U32)(ip-base);
@@ -426,6 +517,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
     const BYTE* const prefixStart = base + dictLimit;
     U32 const btLow = btMask >= current ? 0 : current - btMask;
     U32 const windowLow = ms->window.lowLimit;
+    U32 const matchLow = windowLow ? windowLow : 1;
     U32* smallerPtr = bt + 2*(current&btMask);
     U32* largerPtr  = bt + 2*(current&btMask) + 1;
     U32 matchEndIdx = current+8+1;   /* farthest referenced position of any match => detects repetitive patterns */
@@ -433,8 +525,21 @@ U32 ZSTD_insertBtAndGetAllMatches (
     U32 mnum = 0;
     U32 nbCompares = 1U << cParams->searchLog;
 
+    const ZSTD_matchState_t* dms    = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL;
+    const ZSTD_compressionParameters* const dmsCParams =
+                                      dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL;
+    const BYTE* const dmsBase       = dictMode == ZSTD_dictMatchState ? dms->window.base : NULL;
+    const BYTE* const dmsEnd        = dictMode == ZSTD_dictMatchState ? dms->window.nextSrc : NULL;
+    U32         const dmsHighLimit  = dictMode == ZSTD_dictMatchState ? (U32)(dmsEnd - dmsBase) : 0;
+    U32         const dmsLowLimit   = dictMode == ZSTD_dictMatchState ? dms->window.lowLimit : 0;
+    U32         const dmsIndexDelta = dictMode == ZSTD_dictMatchState ? windowLow - dmsHighLimit : 0;
+    U32         const dmsHashLog    = dictMode == ZSTD_dictMatchState ? dmsCParams->hashLog : hashLog;
+    U32         const dmsBtLog      = dictMode == ZSTD_dictMatchState ? dmsCParams->chainLog - 1 : btLog;
+    U32         const dmsBtMask     = dictMode == ZSTD_dictMatchState ? (1U << dmsBtLog) - 1 : 0;
+    U32         const dmsBtLow      = dictMode == ZSTD_dictMatchState && dmsBtMask < dmsHighLimit - dmsLowLimit ? dmsHighLimit - dmsBtMask : dmsLowLimit;
+
     size_t bestLength = lengthToBeat-1;
-    DEBUGLOG(7, "ZSTD_insertBtAndGetAllMatches");
+    DEBUGLOG(8, "ZSTD_insertBtAndGetAllMatches: current=%u", current);
 
     /* check repCode */
     {   U32 const lastR = ZSTD_REP_NUM + ll0;
@@ -449,18 +554,26 @@ U32 ZSTD_insertBtAndGetAllMatches (
                     repLen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repOffset, iLimit) + minMatch;
                 }
             } else {  /* repIndex < dictLimit || repIndex >= current */
-                const BYTE* const repMatch = dictBase + repIndex;
+                const BYTE* const repMatch = dictMode == ZSTD_dictMatchState ?
+                                             dmsBase + repIndex - dmsIndexDelta :
+                                             dictBase + repIndex;
                 assert(current >= windowLow);
-                if ( extDict /* this case only valid in extDict mode */
+                if ( dictMode == ZSTD_extDict
                   && ( ((repOffset-1) /*intentional overflow*/ < current - windowLow)  /* equivalent to `current > repIndex >= windowLow` */
                      & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */)
                   && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) {
                     repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch;
+                }
+                if (dictMode == ZSTD_dictMatchState
+                  && ( ((repOffset-1) /*intentional overflow*/ < current - (dmsLowLimit + dmsIndexDelta))  /* equivalent to `current > repIndex >= dmsLowLimit` */
+                     & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */
+                  && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) {
+                    repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch;
             }   }
             /* save longer solution */
             if (repLen > bestLength) {
-                DEBUGLOG(8, "found rep-match %u of length %u",
-                            repCode - ll0, (U32)repLen);
+                DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",
+                            repCode, ll0, repOffset, repLen);
                 bestLength = repLen;
                 matches[mnum].off = repCode - ll0;
                 matches[mnum].len = (U32)repLen;
@@ -473,10 +586,10 @@ U32 ZSTD_insertBtAndGetAllMatches (
     /* HC3 match finder */
     if ((mls == 3) /*static*/ && (bestLength < mls)) {
         U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(ms, ip);
-        if ((matchIndex3 > windowLow)
+        if ((matchIndex3 >= matchLow)
           & (current - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) {
             size_t mlen;
-            if ((!extDict) /*static*/ || (matchIndex3 >= dictLimit)) {
+            if ((dictMode == ZSTD_noDict) /*static*/ || (dictMode == ZSTD_dictMatchState) /*static*/ || (matchIndex3 >= dictLimit)) {
                 const BYTE* const match = base + matchIndex3;
                 mlen = ZSTD_count(ip, match, iLimit);
             } else {
@@ -498,17 +611,21 @@ U32 ZSTD_insertBtAndGetAllMatches (
                      (ip+mlen == iLimit) ) {  /* best possible length */
                     ms->nextToUpdate = current+1;  /* skip insertion */
                     return 1;
-    }   }   }   }
+                }
+            }
+        }
+        /* no dictMatchState lookup: dicts don't have a populated HC3 table */
+    }
 
     hashTable[h] = current;   /* Update Hash Table */
 
-    while (nbCompares-- && (matchIndex > windowLow)) {
+    while (nbCompares-- && (matchIndex >= matchLow)) {
         U32* const nextPtr = bt + 2*(matchIndex & btMask);
         size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
         const BYTE* match;
         assert(current > matchIndex);
 
-        if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
+        if ((dictMode == ZSTD_noDict) || (dictMode == ZSTD_dictMatchState) || (matchIndex+matchLength >= dictLimit)) {
             assert(matchIndex+matchLength >= dictLimit);  /* ensure the condition is correct when !extDict */
             match = base + matchIndex;
             matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit);
@@ -520,8 +637,8 @@ U32 ZSTD_insertBtAndGetAllMatches (
         }
 
         if (matchLength > bestLength) {
-            DEBUGLOG(8, "found match of length %u at distance %u",
-                    (U32)matchLength, current - matchIndex);
+            DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)",
+                    (U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE);
             assert(matchEndIdx > matchIndex);
             if (matchLength > matchEndIdx - matchIndex)
                 matchEndIdx = matchIndex + (U32)matchLength;
@@ -529,9 +646,10 @@ U32 ZSTD_insertBtAndGetAllMatches (
             matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE;
             matches[mnum].len = (U32)matchLength;
             mnum++;
-            if (matchLength > ZSTD_OPT_NUM) break;
-            if (ip+matchLength == iLimit) {  /* equal : no way to know if inf or sup */
-                break;   /* drop, to preserve bt consistency (miss a little bit of compression) */
+            if ( (matchLength > ZSTD_OPT_NUM)
+               | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) {
+                if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */
+                break; /* drop, to preserve bt consistency (miss a little bit of compression) */
             }
         }
 
@@ -552,6 +670,47 @@ U32 ZSTD_insertBtAndGetAllMatches (
 
     *smallerPtr = *largerPtr = 0;
 
+    if (dictMode == ZSTD_dictMatchState && nbCompares) {
+        size_t const dmsH = ZSTD_hashPtr(ip, dmsHashLog, mls);
+        U32 dictMatchIndex = dms->hashTable[dmsH];
+        const U32* const dmsBt = dms->chainTable;
+        commonLengthSmaller = commonLengthLarger = 0;
+        while (nbCompares-- && (dictMatchIndex > dmsLowLimit)) {
+            const U32* const nextPtr = dmsBt + 2*(dictMatchIndex & dmsBtMask);
+            size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+            const BYTE* match = dmsBase + dictMatchIndex;
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dmsEnd, prefixStart);
+            if (dictMatchIndex+matchLength >= dmsHighLimit)
+                match = base + dictMatchIndex + dmsIndexDelta;   /* to prepare for next usage of match[matchLength] */
+
+            if (matchLength > bestLength) {
+                matchIndex = dictMatchIndex + dmsIndexDelta;
+                DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)",
+                        (U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE);
+                if (matchLength > matchEndIdx - matchIndex)
+                    matchEndIdx = matchIndex + (U32)matchLength;
+                bestLength = matchLength;
+                matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE;
+                matches[mnum].len = (U32)matchLength;
+                mnum++;
+                if ( (matchLength > ZSTD_OPT_NUM)
+                   | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) {
+                    break;   /* drop, to guarantee consistency (miss a little bit of compression) */
+                }
+            }
+
+            if (dictMatchIndex <= dmsBtLow) { break; }   /* beyond tree size, stop the search */
+            if (match[matchLength] < ip[matchLength]) {
+                commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+                dictMatchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+            } else {
+                /* match is larger than current */
+                commonLengthLarger = matchLength;
+                dictMatchIndex = nextPtr[0];
+            }
+        }
+    }
+
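
The dms* locals above re-base dictionary indices into the current window's index space: a candidate found at dictMatchIndex is priced as matchIndex = dictMatchIndex + dmsIndexDelta, with dmsIndexDelta = windowLow - dmsHighLimit, so the dictionary behaves as if it sat immediately below windowLow and the usual current - matchIndex offset computation still applies. With hypothetical values windowLow = 0x10000 and dmsHighLimit = 0x8000, dmsIndexDelta = 0x8000: a dictionary match at index 0x100 is treated as index 0x8100, and the last dictionary byte (index 0x7FFF) maps to 0xFFFF, right below the start of the current window.
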
     assert(matchEndIdx > current+8);
     ms->nextToUpdate = matchEndIdx - 8;  /* skip repetitive patterns */
     return mnum;
@@ -559,23 +718,24 @@ U32 ZSTD_insertBtAndGetAllMatches (
 
 
 FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
-                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-                        const BYTE* ip, const BYTE* const iHighLimit, int const extDict,
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iHighLimit, const ZSTD_dictMode_e dictMode,
                         U32 rep[ZSTD_REP_NUM], U32 const ll0,
                         ZSTD_match_t* matches, U32 const lengthToBeat)
 {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32 const matchLengthSearch = cParams->searchLength;
-    DEBUGLOG(7, "ZSTD_BtGetAllMatches");
+    DEBUGLOG(8, "ZSTD_BtGetAllMatches");
     if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
-    ZSTD_updateTree_internal(ms, cParams, ip, iHighLimit, matchLengthSearch, extDict);
+    ZSTD_updateTree_internal(ms, ip, iHighLimit, matchLengthSearch, dictMode);
     switch(matchLengthSearch)
     {
-    case 3 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 3);
+    case 3 : return ZSTD_insertBtAndGetAllMatches(ms, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 3);
     default :
-    case 4 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 4);
-    case 5 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 5);
+    case 4 : return ZSTD_insertBtAndGetAllMatches(ms, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 4);
+    case 5 : return ZSTD_insertBtAndGetAllMatches(ms, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 5);
     case 7 :
-    case 6 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 6);
+    case 6 : return ZSTD_insertBtAndGetAllMatches(ms, ip, iHighLimit, dictMode, rep, ll0, matches, lengthToBeat, 6);
     }
 }
 
@@ -587,7 +747,7 @@ typedef struct repcodes_s {
     U32 rep[3];
 } repcodes_t;
 
-repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0)
+static repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0)
 {
     repcodes_t newReps;
     if (offset >= ZSTD_REP_NUM) {  /* full offset */
@@ -609,65 +769,17 @@ repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0)
 }
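
ZSTD_updateRep() encodes the repcode convention used throughout the parser: a value >= ZSTD_REP_NUM encodes a real offset (the actual distance is offset - ZSTD_REP_MOVE) and pushes the history down, while smaller values index into the history, shifted by one when the sequence carries no literals (ll0). A self-contained sketch of the same rule, for illustration only (not part of the patch):

#include <stdio.h>

#define REP_NUM  3              /* stands in for ZSTD_REP_NUM */
#define REP_MOVE (REP_NUM - 1)  /* stands in for ZSTD_REP_MOVE */

static void updateRep(unsigned rep[REP_NUM], unsigned offset, unsigned ll0)
{
    if (offset >= REP_NUM) {            /* full offset : push it in front of the history */
        rep[2] = rep[1];
        rep[1] = rep[0];
        rep[0] = offset - REP_MOVE;
    } else {                            /* repcode : ll0 shifts the index when litLength == 0 */
        unsigned const repCode = offset + ll0;
        if (repCode > 0) {              /* repCode == 0 keeps the history unchanged */
            unsigned const currentOffset = (repCode == REP_NUM) ? rep[0] - 1 : rep[repCode];
            rep[2] = (repCode >= 2) ? rep[1] : rep[2];
            rep[1] = rep[0];
            rep[0] = currentOffset;
        }
    }
}

int main(void)
{
    unsigned rep[REP_NUM] = { 1, 4, 8 };
    updateRep(rep, 2 /*repcode*/, 0 /*ll0*/);      /* selects the old rep[2] == 8 */
    printf("%u %u %u\n", rep[0], rep[1], rep[2]);  /* prints: 8 1 4 */
    return 0;
}
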
 
 
-typedef struct {
-    const BYTE* anchor;
-    U32 litlen;
-    U32 rawLitCost;
-} cachedLiteralPrice_t;
-
-static U32 ZSTD_rawLiteralsCost_cached(
-                            cachedLiteralPrice_t* const cachedLitPrice,
-                            const BYTE* const anchor, U32 const litlen,
-                            const optState_t* const optStatePtr)
+static U32 ZSTD_totalLen(ZSTD_optimal_t sol)
 {
-    U32 startCost;
-    U32 remainingLength;
-    const BYTE* startPosition;
-
-    if (anchor == cachedLitPrice->anchor) {
-        startCost = cachedLitPrice->rawLitCost;
-        startPosition = anchor + cachedLitPrice->litlen;
-        assert(litlen >= cachedLitPrice->litlen);
-        remainingLength = litlen - cachedLitPrice->litlen;
-    } else {
-        startCost = 0;
-        startPosition = anchor;
-        remainingLength = litlen;
-    }
-
-    {   U32 const rawLitCost = startCost + ZSTD_rawLiteralsCost(startPosition, remainingLength, optStatePtr);
-        cachedLitPrice->anchor = anchor;
-        cachedLitPrice->litlen = litlen;
-        cachedLitPrice->rawLitCost = rawLitCost;
-        return rawLitCost;
-    }
+    return sol.litlen + sol.mlen;
 }
 
-static U32 ZSTD_fullLiteralsCost_cached(
-                            cachedLiteralPrice_t* const cachedLitPrice,
-                            const BYTE* const anchor, U32 const litlen,
-                            const optState_t* const optStatePtr)
-{
-    return ZSTD_rawLiteralsCost_cached(cachedLitPrice, anchor, litlen, optStatePtr)
-         + ZSTD_litLengthPrice(litlen, optStatePtr);
-}
-
-static int ZSTD_literalsContribution_cached(
-                            cachedLiteralPrice_t* const cachedLitPrice,
-                            const BYTE* const anchor, U32 const litlen,
-                            const optState_t* const optStatePtr)
-{
-    int const contribution = ZSTD_rawLiteralsCost_cached(cachedLitPrice, anchor, litlen, optStatePtr)
-                           + ZSTD_litLengthContribution(litlen, optStatePtr);
-    return contribution;
-}
-
-FORCE_INLINE_TEMPLATE
-size_t ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,seqStore_t* seqStore,
-                                      U32 rep[ZSTD_REP_NUM],
-                                      ZSTD_compressionParameters const* cParams,
-                                      const void* src, size_t srcSize,
-                                      const int optLevel, const int extDict)
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+                               seqStore_t* seqStore,
+                               U32 rep[ZSTD_REP_NUM],
+                               const void* src, size_t srcSize,
+                               const int optLevel, const ZSTD_dictMode_e dictMode)
 {
     optState_t* const optStatePtr = &ms->opt;
     const BYTE* const istart = (const BYTE*)src;
@@ -677,72 +789,76 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,seqStore_t* seqStore
     const BYTE* const ilimit = iend - 8;
     const BYTE* const base = ms->window.base;
     const BYTE* const prefixStart = base + ms->window.dictLimit;
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
 
     U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
     U32 const minMatch = (cParams->searchLength == 3) ? 3 : 4;
 
     ZSTD_optimal_t* const opt = optStatePtr->priceTable;
     ZSTD_match_t* const matches = optStatePtr->matchTable;
-    cachedLiteralPrice_t cachedLitPrice;
+    ZSTD_optimal_t lastSequence;
 
     /* init */
     DEBUGLOG(5, "ZSTD_compressBlock_opt_generic");
+    assert(optLevel <= 2);
     ms->nextToUpdate3 = ms->nextToUpdate;
-    ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize);
+    ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize, optLevel);
     ip += (ip==prefixStart);
-    memset(&cachedLitPrice, 0, sizeof(cachedLitPrice));
 
     /* Match Loop */
     while (ip < ilimit) {
         U32 cur, last_pos = 0;
-        U32 best_mlen, best_off;
 
         /* find first match */
         {   U32 const litlen = (U32)(ip - anchor);
             U32 const ll0 = !litlen;
-            U32 const nbMatches = ZSTD_BtGetAllMatches(ms, cParams, ip, iend, extDict, rep, ll0, matches, minMatch);
+            U32 const nbMatches = ZSTD_BtGetAllMatches(ms, ip, iend, dictMode, rep, ll0, matches, minMatch);
             if (!nbMatches) { ip++; continue; }
 
             /* initialize opt[0] */
+            { U32 i ; for (i=0; i<ZSTD_REP_NUM; i++) opt[0].rep[i] = rep[i]; }
+            opt[0].mlen = 0;  /* means is_a_literal */
+            opt[0].litlen = litlen;
+            opt[0].price = ZSTD_literalsContribution(anchor, litlen, optStatePtr, optLevel);
+
+            /* large match -> immediate encoding */
             {   U32 const maxML = matches[nbMatches-1].len;
-                DEBUGLOG(7, "found %u matches of maxLength=%u and offset=%u at cPos=%u => start new serie",
-                            nbMatches, maxML, matches[nbMatches-1].off, (U32)(ip-prefixStart));
+                U32 const maxOffset = matches[nbMatches-1].off;
+                DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series",
+                            nbMatches, maxML, maxOffset, (U32)(ip-prefixStart));
 
                 if (maxML > sufficient_len) {
-                    best_mlen = maxML;
-                    best_off = matches[nbMatches-1].off;
-                    DEBUGLOG(7, "large match (%u>%u), immediate encoding",
-                                best_mlen, sufficient_len);
+                    lastSequence.litlen = litlen;
+                    lastSequence.mlen = maxML;
+                    lastSequence.off = maxOffset;
+                    DEBUGLOG(6, "large match (%u>%u), immediate encoding",
+                                maxML, sufficient_len);
                     cur = 0;
-                    last_pos = 1;
+                    last_pos = ZSTD_totalLen(lastSequence);
                     goto _shortestPath;
             }   }
 
             /* set prices for first matches starting position == 0 */
-            {   U32 const literalsPrice = ZSTD_fullLiteralsCost_cached(&cachedLitPrice, anchor, litlen, optStatePtr);
+            {   U32 const literalsPrice = opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
                 U32 pos;
                 U32 matchNb;
-                for (pos = 0; pos < minMatch; pos++) {
-                    opt[pos].mlen = 1;
-                    opt[pos].price = ZSTD_MAX_PRICE;
+                for (pos = 1; pos < minMatch; pos++) {
+                    opt[pos].price = ZSTD_MAX_PRICE;   /* mlen, litlen and price will be fixed during forward scanning */
                 }
                 for (matchNb = 0; matchNb < nbMatches; matchNb++) {
                     U32 const offset = matches[matchNb].off;
                     U32 const end = matches[matchNb].len;
                     repcodes_t const repHistory = ZSTD_updateRep(rep, offset, ll0);
                     for ( ; pos <= end ; pos++ ) {
-                        U32 const matchPrice = literalsPrice + ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel);
-                        DEBUGLOG(7, "rPos:%u => set initial price : %u",
-                                    pos, matchPrice);
+                        U32 const matchPrice = ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel);
+                        U32 const sequencePrice = literalsPrice + matchPrice;
+                        DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
+                                    pos, ZSTD_fCost(sequencePrice));
                         opt[pos].mlen = pos;
                         opt[pos].off = offset;
                         opt[pos].litlen = litlen;
-                        opt[pos].price = matchPrice;
+                        opt[pos].price = sequencePrice;
+                        ZSTD_STATIC_ASSERT(sizeof(opt[pos].rep) == sizeof(repHistory));
                         memcpy(opt[pos].rep, &repHistory, sizeof(repHistory));
                 }   }
                 last_pos = pos-1;
@@ -753,55 +869,67 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,seqStore_t* seqStore
         for (cur = 1; cur <= last_pos; cur++) {
             const BYTE* const inr = ip + cur;
             assert(cur < ZSTD_OPT_NUM);
+            DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur)
 
             /* Fix current position with one literal if cheaper */
-            {   U32 const litlen = (opt[cur-1].mlen == 1) ? opt[cur-1].litlen + 1 : 1;
-                int price;  /* note : contribution can be negative */
-                if (cur > litlen) {
-                    price = opt[cur - litlen].price + ZSTD_literalsContribution(inr-litlen, litlen, optStatePtr);
-                } else {
-                    price = ZSTD_literalsContribution_cached(&cachedLitPrice, anchor, litlen, optStatePtr);
-                }
+            {   U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1;
+                int const price = opt[cur-1].price
+                                + ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel)
+                                + ZSTD_litLengthPrice(litlen, optStatePtr, optLevel)
+                                - ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel);
                 assert(price < 1000000000); /* overflow check */
                 if (price <= opt[cur].price) {
-                    DEBUGLOG(7, "rPos:%u : better price (%u<%u) using literal",
-                                cur, price, opt[cur].price);
-                    opt[cur].mlen = 1;
+                    DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)",
+                                inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen,
+                                opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]);
+                    opt[cur].mlen = 0;
                     opt[cur].off = 0;
                     opt[cur].litlen = litlen;
                     opt[cur].price = price;
                     memcpy(opt[cur].rep, opt[cur-1].rep, sizeof(opt[cur].rep));
-            }   }
+                } else {
+                    DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)",
+                                inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price),
+                                opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]);
+                }
+            }
 
             /* last match must start at a minimum distance of 8 from oend */
             if (inr > ilimit) continue;
 
             if (cur == last_pos) break;
 
-             if ( (optLevel==0) /*static*/
-               && (opt[cur+1].price <= opt[cur].price) )
+            if ( (optLevel==0) /*static_test*/
+              && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) {
+                DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1);
                 continue;  /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
+            }
 
-            {   U32 const ll0 = (opt[cur].mlen != 1);
-                U32 const litlen = (opt[cur].mlen == 1) ? opt[cur].litlen : 0;
-                U32 const previousPrice = (cur > litlen) ? opt[cur-litlen].price : 0;
-                U32 const basePrice = previousPrice + ZSTD_fullLiteralsCost(inr-litlen, litlen, optStatePtr);
-                U32 const nbMatches = ZSTD_BtGetAllMatches(ms, cParams, inr, iend, extDict, opt[cur].rep, ll0, matches, minMatch);
+            {   U32 const ll0 = (opt[cur].mlen != 0);
+                U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
+                U32 const previousPrice = opt[cur].price;
+                U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
+                U32 const nbMatches = ZSTD_BtGetAllMatches(ms, inr, iend, dictMode, opt[cur].rep, ll0, matches, minMatch);
                 U32 matchNb;
-                if (!nbMatches) continue;
+                if (!nbMatches) {
+                    DEBUGLOG(7, "rPos:%u : no match found", cur);
+                    continue;
+                }
 
                 {   U32 const maxML = matches[nbMatches-1].len;
-                    DEBUGLOG(7, "rPos:%u, found %u matches, of maxLength=%u",
-                                cur, nbMatches, maxML);
+                    DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u",
+                                inr-istart, cur, nbMatches, maxML);
 
                     if ( (maxML > sufficient_len)
-                       | (cur + maxML >= ZSTD_OPT_NUM) ) {
-                        best_mlen = maxML;
-                        best_off = matches[nbMatches-1].off;
-                        last_pos = cur + 1;
+                      || (cur + maxML >= ZSTD_OPT_NUM) ) {
+                        lastSequence.mlen = maxML;
+                        lastSequence.off = matches[nbMatches-1].off;
+                        lastSequence.litlen = litlen;
+                        cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0;  /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case it's the first sequence, and it's okay */
+                        last_pos = cur + ZSTD_totalLen(lastSequence);
+                        if (cur > ZSTD_OPT_NUM) cur = 0;   /* underflow => first match */
                         goto _shortestPath;
-                    }
-                }
+                }   }
 
                 /* set prices using matches found at position == cur */
                 for (matchNb = 0; matchNb < nbMatches; matchNb++) {
@@ -811,81 +939,97 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,seqStore_t* seqStore
                     U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
                     U32 mlen;
 
-                    DEBUGLOG(7, "testing match %u => offCode=%u, mlen=%u, llen=%u",
+                    DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u",
                                 matchNb, matches[matchNb].off, lastML, litlen);
 
-                    for (mlen = lastML; mlen >= startML; mlen--) {
+                    for (mlen = lastML; mlen >= startML; mlen--) {  /* scan downward */
                         U32 const pos = cur + mlen;
                         int const price = basePrice + ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
 
                         if ((pos > last_pos) || (price < opt[pos].price)) {
-                            DEBUGLOG(7, "rPos:%u => new better price (%u<%u)",
-                                        pos, price, opt[pos].price);
-                            while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; }
+                            DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)",
+                                        pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
+                            while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; }   /* fill empty positions */
                             opt[pos].mlen = mlen;
                             opt[pos].off = offset;
                             opt[pos].litlen = litlen;
                             opt[pos].price = price;
+                            ZSTD_STATIC_ASSERT(sizeof(opt[pos].rep) == sizeof(repHistory));
                             memcpy(opt[pos].rep, &repHistory, sizeof(repHistory));
                         } else {
-                            if (optLevel==0) break;  /* gets ~+10% speed for about -0.01 ratio loss */
+                            DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)",
+                                        pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
+                            if (optLevel==0) break;  /* early update abort; gets ~+10% speed for about -0.01 ratio loss */
                         }
             }   }   }
         }  /* for (cur = 1; cur <= last_pos; cur++) */
 
-        best_mlen = opt[last_pos].mlen;
-        best_off = opt[last_pos].off;
-        cur = last_pos - best_mlen;
+        lastSequence = opt[last_pos];
+        cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0;  /* single sequence, and it starts before `ip` */
+        assert(cur < ZSTD_OPT_NUM);  /* control overflow*/
 
 _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
-        assert(opt[0].mlen == 1);
+        assert(opt[0].mlen == 0);
 
-        /* reverse traversal */
-        DEBUGLOG(7, "start reverse traversal (last_pos:%u, cur:%u)",
-                    last_pos, cur);
-        {   U32 selectedMatchLength = best_mlen;
-            U32 selectedOffset = best_off;
-            U32 pos = cur;
-            while (1) {
-                U32 const mlen = opt[pos].mlen;
-                U32 const off = opt[pos].off;
-                opt[pos].mlen = selectedMatchLength;
-                opt[pos].off = selectedOffset;
-                selectedMatchLength = mlen;
-                selectedOffset = off;
-                if (mlen > pos) break;
-                pos -= mlen;
-        }   }
+        {   U32 const storeEnd = cur + 1;
+            U32 storeStart = storeEnd;
+            U32 seqPos = cur;
 
-        /* save sequences */
-        {   U32 pos;
-            for (pos=0; pos < last_pos; ) {
-                U32 const llen = (U32)(ip - anchor);
-                U32 const mlen = opt[pos].mlen;
-                U32 const offset = opt[pos].off;
-                if (mlen == 1) { ip++; pos++; continue; }  /* literal position => move on */
-                pos += mlen; ip += mlen;
+            DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)",
+                        last_pos, cur); (void)last_pos;
+            assert(storeEnd < ZSTD_OPT_NUM);
+            DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
+                        storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off);
+            opt[storeEnd] = lastSequence;
+            while (seqPos > 0) {
+                U32 const backDist = ZSTD_totalLen(opt[seqPos]);
+                storeStart--;
+                DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
+                            seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off);
+                opt[storeStart] = opt[seqPos];
+                seqPos = (seqPos > backDist) ? seqPos - backDist : 0;
+            }
 
-                /* repcodes update : like ZSTD_updateRep(), but update in place */
-                if (offset >= ZSTD_REP_NUM) {  /* full offset */
-                    rep[2] = rep[1];
-                    rep[1] = rep[0];
-                    rep[0] = offset - ZSTD_REP_MOVE;
-                } else {   /* repcode */
-                    U32 const repCode = offset + (llen==0);
-                    if (repCode) {  /* note : if repCode==0, no change */
-                        U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
-                        if (repCode >= 2) rep[2] = rep[1];
-                        rep[1] = rep[0];
-                        rep[0] = currentOffset;
+            /* save sequences */
+            DEBUGLOG(6, "sending selected sequences into seqStore")
+            {   U32 storePos;
+                for (storePos=storeStart; storePos <= storeEnd; storePos++) {
+                    U32 const llen = opt[storePos].litlen;
+                    U32 const mlen = opt[storePos].mlen;
+                    U32 const offCode = opt[storePos].off;
+                    U32 const advance = llen + mlen;
+                    DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u",
+                                anchor - istart, llen, mlen);
+
+                    if (mlen==0) {  /* only literals => must be last "sequence", actually starting a new stream of sequences */
+                        assert(storePos == storeEnd);   /* must be last sequence */
+                        ip = anchor + llen;     /* last "sequence" is a bunch of literals => don't progress anchor */
+                        continue;   /* will finish */
                     }
-                }
 
-                ZSTD_updateStats(optStatePtr, llen, anchor, offset, mlen);
-                ZSTD_storeSeq(seqStore, llen, anchor, offset, mlen-MINMATCH);
-                anchor = ip;
-        }   }
-        ZSTD_setLog2Prices(optStatePtr);
+                    /* repcodes update : like ZSTD_updateRep(), but update in place */
+                    if (offCode >= ZSTD_REP_NUM) {  /* full offset */
+                        rep[2] = rep[1];
+                        rep[1] = rep[0];
+                        rep[0] = offCode - ZSTD_REP_MOVE;
+                    } else {   /* repcode */
+                        U32 const repCode = offCode + (llen==0);
+                        if (repCode) {  /* note : if repCode==0, no change */
+                            U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
+                            if (repCode >= 2) rep[2] = rep[1];
+                            rep[1] = rep[0];
+                            rep[0] = currentOffset;
+                    }   }
+
+                    assert(anchor + llen <= iend);
+                    ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen);
+                    ZSTD_storeSeq(seqStore, llen, anchor, offCode, mlen-MINMATCH);
+                    anchor += advance;
+                    ip = anchor;
+            }   }
+            ZSTD_setBasePrices(optStatePtr, optLevel);
+        }
+
     }   /* while (ip < ilimit) */
 
     /* Return the last literals size */
@@ -895,29 +1039,94 @@ _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
 
 size_t ZSTD_compressBlock_btopt(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressBlock_btopt");
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, 0 /*extDict*/);
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_noDict);
+}
+
+
+/* used in 2-pass strategy */
+static U32 ZSTD_upscaleStat(U32* table, U32 lastEltIndex, int bonus)
+{
+    U32 s, sum=0;
+    assert(ZSTD_FREQ_DIV+bonus > 0);
+    for (s=0; s<=lastEltIndex; s++) {
+        table[s] <<= ZSTD_FREQ_DIV+bonus;
+        table[s]--;
+        sum += table[s];
+    }
+    return sum;
+}
+
+/* used in 2-pass strategy */
+MEM_STATIC void ZSTD_upscaleStats(optState_t* optPtr)
+{
+    optPtr->litSum = ZSTD_upscaleStat(optPtr->litFreq, MaxLit, 0);
+    optPtr->litLengthSum = ZSTD_upscaleStat(optPtr->litLengthFreq, MaxLL, 1);
+    optPtr->matchLengthSum = ZSTD_upscaleStat(optPtr->matchLengthFreq, MaxML, 1);
+    optPtr->offCodeSum = ZSTD_upscaleStat(optPtr->offCodeFreq, MaxOff, 1);
 }
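
ZSTD_upscaleStat() is roughly the inverse of ZSTD_downscaleStat(): each count is shifted back up by ZSTD_FREQ_DIV+bonus bits (then decremented, approximately undoing the +1 floor), so that if the 2-pass strategy below were enabled, the statistics gathered on the first pass would outweigh the per-block rescaling on the second pass. For a hypothetical shift of 5, a downscaled count of 7 becomes 7 * 32 - 1 = 223.
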
 
 size_t ZSTD_compressBlock_btultra(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, 0 /*extDict*/);
+    DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize);
+#if 0
+    /* 2-pass strategy (disabled)
+     * this strategy makes a first pass over first block to collect statistics
+     * and seed next round's statistics with it.
+     * The compression ratio gain is generally small (~0.5% on first block),
+     * the cost is 2x cpu time on first block. */
+    assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
+    if ( (ms->opt.litLengthSum==0)   /* first block */
+      && (seqStore->sequences == seqStore->sequencesStart)   /* no ldm */
+      && (ms->window.dictLimit == ms->window.lowLimit) ) {   /* no dictionary */
+        U32 tmpRep[ZSTD_REP_NUM];
+        DEBUGLOG(5, "ZSTD_compressBlock_btultra: first block: collecting statistics");
+        assert(ms->nextToUpdate >= ms->window.dictLimit
+            && ms->nextToUpdate <= ms->window.dictLimit + 1);
+        memcpy(tmpRep, rep, sizeof(tmpRep));
+        ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);   /* generate stats into ms->opt*/
+        ZSTD_resetSeqStore(seqStore);
+        /* invalidate first scan from history */
+        ms->window.base -= srcSize;
+        ms->window.dictLimit += (U32)srcSize;
+        ms->window.lowLimit = ms->window.dictLimit;
+        ms->nextToUpdate = ms->window.dictLimit;
+        ms->nextToUpdate3 = ms->window.dictLimit;
+        /* reinforce weight of collected statistics */
+        ZSTD_upscaleStats(&ms->opt);
+    }
+#endif
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_btopt_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const void* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_btultra_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const void* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState);
 }
 
 size_t ZSTD_compressBlock_btopt_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, 1 /*extDict*/);
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_extDict);
 }
 
 size_t ZSTD_compressBlock_btultra_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+        const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, 1 /*extDict*/);
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_extDict);
 }
diff --git a/lib/compress/zstd_opt.h b/lib/compress/zstd_opt.h
index b8dc389f317a..eeadb604c6a8 100644
--- a/lib/compress/zstd_opt.h
+++ b/lib/compress/zstd_opt.h
@@ -17,23 +17,29 @@ extern "C" {
 
 #include "zstd_compress_internal.h"
 
-void ZSTD_updateTree(
-        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
-        const BYTE* ip, const BYTE* iend);  /* used in ZSTD_loadDictionaryContent() */
+/* used in ZSTD_loadDictionaryContent() */
+void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend);
 
 size_t ZSTD_compressBlock_btopt(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_btultra(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_btopt_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra_dictMatchState(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
 
 size_t ZSTD_compressBlock_btopt_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_btultra_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+        void const* src, size_t srcSize);
 
 #if defined (__cplusplus)
 }
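
With cParams removed from the parameter lists above, every optimal-parser entry point now shares one signature, which is what makes a uniform dispatch over strategy and dictionary mode possible. The sketch below is illustrative only: it assumes the internal types and ZSTD_dictMode_e values referenced in this patch (via zstd_compress_internal.h), and the dispatcher name is invented here, not the library's actual selector.

    /* hypothetical dispatcher over the uniform block-compressor signature */
    #include "zstd_compress_internal.h"   /* ZSTD_matchState_t, seqStore_t, ZSTD_dictMode_e (assumed) */
    #include "zstd_opt.h"                 /* prototypes declared above */

    typedef size_t (*blockCompressor_sketch)(
            ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
            void const* src, size_t srcSize);

    static size_t compressBlock_opt_dispatch_sketch(
            ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
            void const* src, size_t srcSize,
            int ultra, ZSTD_dictMode_e dictMode)
    {
        blockCompressor_sketch f;
        switch (dictMode) {
        case ZSTD_dictMatchState:
            f = ultra ? ZSTD_compressBlock_btultra_dictMatchState
                      : ZSTD_compressBlock_btopt_dictMatchState;
            break;
        case ZSTD_extDict:
            f = ultra ? ZSTD_compressBlock_btultra_extDict
                      : ZSTD_compressBlock_btopt_extDict;
            break;
        default:   /* ZSTD_noDict */
            f = ultra ? ZSTD_compressBlock_btultra
                      : ZSTD_compressBlock_btopt;
        }
        return f(ms, seqStore, rep, src, srcSize);   /* all variants now take the same arguments */
    }
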
diff --git a/lib/compress/zstdmt_compress.c b/lib/compress/zstdmt_compress.c
index c7a205d8c757..f4aba1d2c494 100644
--- a/lib/compress/zstdmt_compress.c
+++ b/lib/compress/zstdmt_compress.c
@@ -37,18 +37,19 @@
 #define ZSTD_RESIZE_SEQPOOL 0
 
 /* ======   Debug   ====== */
-#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=2)
+#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) \
+    && !defined(_MSC_VER) \
+    && !defined(__MINGW32__)
 
 #  include <stdio.h>
 #  include <unistd.h>
 #  include <sys/times.h>
-#  define DEBUGLOGRAW(l, ...) if (l<=ZSTD_DEBUG) { fprintf(stderr, __VA_ARGS__); }
 
 #  define DEBUG_PRINTHEX(l,p,n) {            \
     unsigned debug_u;                        \
     for (debug_u=0; debug_u<(n); debug_u++)  \
-        DEBUGLOGRAW(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \
-    DEBUGLOGRAW(l, " \n");                   \
+        RAWLOG(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \
+    RAWLOG(l, " \n");                        \
 }
 
 static unsigned long long GetCurrentClockTimeMicroseconds(void)
@@ -62,7 +63,7 @@ static unsigned long long GetCurrentClockTimeMicroseconds(void)
 
 #define MUTEX_WAIT_TIME_DLEVEL 6
 #define ZSTD_PTHREAD_MUTEX_LOCK(mutex) {          \
-    if (ZSTD_DEBUG >= MUTEX_WAIT_TIME_DLEVEL) {   \
+    if (DEBUGLEVEL >= MUTEX_WAIT_TIME_DLEVEL) {   \
         unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds(); \
         ZSTD_pthread_mutex_lock(mutex);           \
         {   unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \
@@ -160,6 +161,25 @@ static void ZSTDMT_setBufferSize(ZSTDMT_bufferPool* const bufPool, size_t const
     ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
 }
 
+
+static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, U32 nbWorkers)
+{
+    unsigned const maxNbBuffers = 2*nbWorkers + 3;
+    if (srcBufPool==NULL) return NULL;
+    if (srcBufPool->totalBuffers >= maxNbBuffers) /* good enough */
+        return srcBufPool;
+    /* need a larger buffer pool */
+    {   ZSTD_customMem const cMem = srcBufPool->cMem;
+        size_t const bSize = srcBufPool->bufferSize;   /* forward parameters */
+        ZSTDMT_bufferPool* newBufPool;
+        ZSTDMT_freeBufferPool(srcBufPool);
+        newBufPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+        if (newBufPool==NULL) return newBufPool;
+        ZSTDMT_setBufferSize(newBufPool, bSize);
+        return newBufPool;
+    }
+}
+
 /** ZSTDMT_getBuffer() :
  *  assumption : bufPool must be valid
  * @return : a buffer, with start pointer and size
@@ -229,8 +249,8 @@ static buffer_t ZSTDMT_resizeBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buffer)
 /* store buffer for later re-use, up to pool capacity */
 static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buf)
 {
-    if (buf.start == NULL) return;   /* compatible with release on NULL */
     DEBUGLOG(5, "ZSTDMT_releaseBuffer");
+    if (buf.start == NULL) return;   /* compatible with release on NULL */
     ZSTD_pthread_mutex_lock(&bufPool->poolMutex);
     if (bufPool->nbBuffers < bufPool->totalBuffers) {
         bufPool->bTable[bufPool->nbBuffers++] = buf;  /* stored for later use */
@@ -300,7 +320,8 @@ static void ZSTDMT_setNbSeq(ZSTDMT_seqPool* const seqPool, size_t const nbSeq)
 
 static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem)
 {
-    ZSTDMT_seqPool* seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+    ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+    if (seqPool == NULL) return NULL;
     ZSTDMT_setNbSeq(seqPool, 0);
     return seqPool;
 }
@@ -310,6 +331,10 @@ static void ZSTDMT_freeSeqPool(ZSTDMT_seqPool* seqPool)
     ZSTDMT_freeBufferPool(seqPool);
 }
 
+static ZSTDMT_seqPool* ZSTDMT_expandSeqPool(ZSTDMT_seqPool* pool, U32 nbWorkers)
+{
+    return ZSTDMT_expandBufferPool(pool, nbWorkers);
+}
 
 
 /* =====   CCtx Pool   ===== */
@@ -355,6 +380,18 @@ static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(unsigned nbWorkers,
     return cctxPool;
 }
 
+static ZSTDMT_CCtxPool* ZSTDMT_expandCCtxPool(ZSTDMT_CCtxPool* srcPool,
+                                              unsigned nbWorkers)
+{
+    if (srcPool==NULL) return NULL;
+    if (nbWorkers <= srcPool->totalCCtx) return srcPool;   /* good enough */
+    /* need a larger cctx pool */
+    {   ZSTD_customMem const cMem = srcPool->cMem;
+        ZSTDMT_freeCCtxPool(srcPool);
+        return ZSTDMT_createCCtxPool(nbWorkers, cMem);
+    }
+}
+
 /* only works during initialization phase, not during compression */
 static size_t ZSTDMT_sizeof_CCtxPool(ZSTDMT_CCtxPool* cctxPool)
 {
@@ -425,12 +462,11 @@ typedef struct {
     ZSTD_window_t ldmWindow;  /* A thread-safe copy of ldmState.window */
 } serialState_t;
 
-static int ZSTDMT_serialState_reset(serialState_t* serialState, ZSTDMT_seqPool* seqPool, ZSTD_CCtx_params params)
+static int ZSTDMT_serialState_reset(serialState_t* serialState, ZSTDMT_seqPool* seqPool, ZSTD_CCtx_params params, size_t jobSize)
 {
     /* Adjust parameters */
     if (params.ldmParams.enableLdm) {
         DEBUGLOG(4, "LDM window size = %u KB", (1U << params.cParams.windowLog) >> 10);
-        params.ldmParams.windowLog = params.cParams.windowLog;
         ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
         assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog);
         assert(params.ldmParams.hashEveryLog < 32);
@@ -453,7 +489,7 @@ static int ZSTDMT_serialState_reset(serialState_t* serialState, ZSTDMT_seqPool*
             serialState->params.ldmParams.hashLog -
             serialState->params.ldmParams.bucketSizeLog;
         /* Size the seq pool tables */
-        ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, params.jobSize));
+        ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, jobSize));
         /* Reset the window */
         ZSTD_window_clear(&serialState->ldmState.window);
         serialState->ldmWindow = serialState->ldmState.window;
@@ -473,6 +509,7 @@ static int ZSTDMT_serialState_reset(serialState_t* serialState, ZSTDMT_seqPool*
         memset(serialState->ldmState.bucketOffsets, 0, bucketSize);
     }
     serialState->params = params;
+    serialState->params.jobSize = (U32)jobSize;
     return 0;
 }
 
@@ -505,6 +542,7 @@ static void ZSTDMT_serialState_update(serialState_t* serialState,
     /* Wait for our turn */
     ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex);
     while (serialState->nextJobID < jobID) {
+        DEBUGLOG(5, "wait for serialState->cond");
         ZSTD_pthread_cond_wait(&serialState->cond, &serialState->mutex);
     }
     /* A future job may error and skip our job */
@@ -514,6 +552,7 @@ static void ZSTDMT_serialState_update(serialState_t* serialState,
             size_t error;
             assert(seqStore.seq != NULL && seqStore.pos == 0 &&
                    seqStore.size == 0 && seqStore.capacity > 0);
+            assert(src.size <= serialState->params.jobSize);
             ZSTD_window_update(&serialState->ldmState.window, src.start, src.size);
             error = ZSTD_ldm_generateSequences(
                 &serialState->ldmState, &seqStore,
@@ -593,14 +632,32 @@ typedef struct {
     unsigned frameChecksumNeeded;        /* used only by mtctx */
 } ZSTDMT_jobDescription;
 
+#define JOB_ERROR(e) {                          \
+    ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);   \
+    job->cSize = e;                             \
+    ZSTD_pthread_mutex_unlock(&job->job_mutex); \
+    goto _endJob;                               \
+}
+
 /* ZSTDMT_compressionJob() is a POOL_function type */
-void ZSTDMT_compressionJob(void* jobDescription)
+static void ZSTDMT_compressionJob(void* jobDescription)
 {
     ZSTDMT_jobDescription* const job = (ZSTDMT_jobDescription*)jobDescription;
     ZSTD_CCtx_params jobParams = job->params;   /* do not modify job->params ! copy it, modify the copy */
     ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(job->cctxPool);
     rawSeqStore_t rawSeqStore = ZSTDMT_getSeq(job->seqPool);
     buffer_t dstBuff = job->dstBuff;
+    size_t lastCBlockSize = 0;
+
+    /* resources */
+    if (cctx==NULL) JOB_ERROR(ERROR(memory_allocation));
+    if (dstBuff.start == NULL) {   /* streaming job : doesn't provide a dstBuffer */
+        dstBuff = ZSTDMT_getBuffer(job->bufPool);
+        if (dstBuff.start==NULL) JOB_ERROR(ERROR(memory_allocation));
+        job->dstBuff = dstBuff;   /* this value can be read in ZSTDMT_flush, when it copies the whole job */
+    }
+    if (jobParams.ldmParams.enableLdm && rawSeqStore.seq == NULL)
+        JOB_ERROR(ERROR(memory_allocation));
 
     /* Don't compute the checksum for chunks, since we compute it externally,
      * but write it in the header.
@@ -609,47 +666,31 @@ void ZSTDMT_compressionJob(void* jobDescription)
     /* Don't run LDM for the chunks, since we handle it externally */
     jobParams.ldmParams.enableLdm = 0;
 
-    /* ressources */
-    if (cctx==NULL) {
-        job->cSize = ERROR(memory_allocation);
-        goto _endJob;
-    }
-    if (dstBuff.start == NULL) {   /* streaming job : doesn't provide a dstBuffer */
-        dstBuff = ZSTDMT_getBuffer(job->bufPool);
-        if (dstBuff.start==NULL) {
-            job->cSize = ERROR(memory_allocation);
-            goto _endJob;
-        }
-        job->dstBuff = dstBuff;   /* this value can be read in ZSTDMT_flush, when it copies the whole job */
-    }
 
     /* init */
     if (job->cdict) {
-        size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, NULL, 0, ZSTD_dct_auto, job->cdict, jobParams, job->fullFrameSize);
+        size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, job->cdict, jobParams, job->fullFrameSize);
         assert(job->firstJob);  /* only allowed for first job */
-        if (ZSTD_isError(initError)) { job->cSize = initError; goto _endJob; }
+        if (ZSTD_isError(initError)) JOB_ERROR(initError);
     } else {  /* srcStart points at reloaded section */
         U64 const pledgedSrcSize = job->firstJob ? job->fullFrameSize : job->src.size;
         {   size_t const forceWindowError = ZSTD_CCtxParam_setParameter(&jobParams, ZSTD_p_forceMaxWindow, !job->firstJob);
-            if (ZSTD_isError(forceWindowError)) {
-                job->cSize = forceWindowError;
-                goto _endJob;
-        }   }
+            if (ZSTD_isError(forceWindowError)) JOB_ERROR(forceWindowError);
+        }
         {   size_t const initError = ZSTD_compressBegin_advanced_internal(cctx,
                                         job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, /* load dictionary in "content-only" mode (no header analysis) */
+                                        ZSTD_dtlm_fast,
                                         NULL, /*cdict*/
                                         jobParams, pledgedSrcSize);
-            if (ZSTD_isError(initError)) {
-                job->cSize = initError;
-                goto _endJob;
-    }   }   }
+            if (ZSTD_isError(initError)) JOB_ERROR(initError);
+    }   }
 
     /* Perform serial step as early as possible, but after CCtx initialization */
     ZSTDMT_serialState_update(job->serial, cctx, rawSeqStore, job->src, job->jobID);
 
     if (!job->firstJob) {  /* flush and overwrite frame header when it's not first job */
         size_t const hSize = ZSTD_compressContinue(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0);
-        if (ZSTD_isError(hSize)) { job->cSize = hSize; /* save error code */ goto _endJob; }
+        if (ZSTD_isError(hSize)) JOB_ERROR(hSize);
         DEBUGLOG(5, "ZSTDMT_compressionJob: flush and overwrite %u bytes of frame header (not first job)", (U32)hSize);
         ZSTD_invalidateRepCodes(cctx);
     }
@@ -667,7 +708,7 @@ void ZSTDMT_compressionJob(void* jobDescription)
         assert(job->cSize == 0);
         for (chunkNb = 1; chunkNb < nbChunks; chunkNb++) {
             size_t const cSize = ZSTD_compressContinue(cctx, op, oend-op, ip, chunkSize);
-            if (ZSTD_isError(cSize)) { job->cSize = cSize; goto _endJob; }
+            if (ZSTD_isError(cSize)) JOB_ERROR(cSize);
             ip += chunkSize;
             op += cSize; assert(op < oend);
             /* stats */
@@ -680,18 +721,16 @@ void ZSTDMT_compressionJob(void* jobDescription)
             ZSTD_pthread_mutex_unlock(&job->job_mutex);
         }
         /* last block */
-        assert(chunkSize > 0); assert((chunkSize & (chunkSize - 1)) == 0);  /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */
+        assert(chunkSize > 0);
+        assert((chunkSize & (chunkSize - 1)) == 0);  /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */
         if ((nbChunks > 0) | job->lastJob /*must output a "last block" flag*/ ) {
             size_t const lastBlockSize1 = job->src.size & (chunkSize-1);
             size_t const lastBlockSize = ((lastBlockSize1==0) & (job->src.size>=chunkSize)) ? chunkSize : lastBlockSize1;
             size_t const cSize = (job->lastJob) ?
                  ZSTD_compressEnd     (cctx, op, oend-op, ip, lastBlockSize) :
                  ZSTD_compressContinue(cctx, op, oend-op, ip, lastBlockSize);
-            if (ZSTD_isError(cSize)) { job->cSize = cSize; goto _endJob; }
-            /* stats */
-            ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
-            job->cSize += cSize;
-            ZSTD_pthread_mutex_unlock(&job->job_mutex);
+            if (ZSTD_isError(cSize)) JOB_ERROR(cSize);
+            lastCBlockSize = cSize;
     }   }
 
 _endJob:
@@ -704,7 +743,9 @@ _endJob:
     ZSTDMT_releaseCCtx(job->cctxPool, cctx);
     /* report */
     ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
-    job->consumed = job->src.size;
+    if (ZSTD_isError(job->cSize)) assert(lastCBlockSize == 0);
+    job->cSize += lastCBlockSize;
+    job->consumed = job->src.size;  /* when job->consumed == job->src.size, the compression job is presumed completed */
     ZSTD_pthread_cond_signal(&job->job_cond);
     ZSTD_pthread_mutex_unlock(&job->job_mutex);
 }
@@ -745,9 +786,9 @@ struct ZSTDMT_CCtx_s {
     ZSTD_CCtx_params params;
     size_t targetSectionSize;
     size_t targetPrefixSize;
-    roundBuff_t roundBuff;
+    int jobReady;        /* 1 => one job is already prepared, but pool has shortage of workers. Don't create a new job. */
     inBuff_t inBuff;
-    int jobReady;        /* 1 => one job is already prepared, but pool has shortage of workers. Don't create another one. */
+    roundBuff_t roundBuff;
     serialState_t serial;
     unsigned singleBlockingThread;
     unsigned jobIDMask;
@@ -798,6 +839,20 @@ static ZSTDMT_jobDescription* ZSTDMT_createJobsTable(U32* nbJobsPtr, ZSTD_custom
     return jobTable;
 }
 
+static size_t ZSTDMT_expandJobsTable (ZSTDMT_CCtx* mtctx, U32 nbWorkers) {
+    U32 nbJobs = nbWorkers + 2;
+    if (nbJobs > mtctx->jobIDMask+1) {  /* need more job capacity */
+        ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem);
+        mtctx->jobIDMask = 0;
+        mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, mtctx->cMem);
+        if (mtctx->jobs==NULL) return ERROR(memory_allocation);
+        assert((nbJobs != 0) && ((nbJobs & (nbJobs - 1)) == 0));  /* ensure nbJobs is a power of 2 */
+        mtctx->jobIDMask = nbJobs - 1;
+    }
+    return 0;
+}
+
+
 /* ZSTDMT_CCtxParam_setNbWorkers():
  * Internal use only */
 size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers)
@@ -875,7 +930,7 @@ static void ZSTDMT_waitForAllJobsCompleted(ZSTDMT_CCtx* mtctx)
         unsigned const jobID = mtctx->doneJobID & mtctx->jobIDMask;
         ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[jobID].job_mutex);
         while (mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].src.size) {
-            DEBUGLOG(5, "waiting for jobCompleted signal from job %u", mtctx->doneJobID);   /* we want to block when waiting for data to flush */
+            DEBUGLOG(4, "waiting for jobCompleted signal from job %u", mtctx->doneJobID);   /* we want to block when waiting for data to flush */
             ZSTD_pthread_cond_wait(&mtctx->jobs[jobID].job_cond, &mtctx->jobs[jobID].job_mutex);
         }
         ZSTD_pthread_mutex_unlock(&mtctx->jobs[jobID].job_mutex);
@@ -924,6 +979,8 @@ size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params,
         if ( (value > 0)  /* value==0 => automatic job size */
            & (value < ZSTDMT_JOBSIZE_MIN) )
             value = ZSTDMT_JOBSIZE_MIN;
+        if (value > ZSTDMT_JOBSIZE_MAX)
+            value = ZSTDMT_JOBSIZE_MAX;
         params->jobSize = value;
         return value;
     case ZSTDMT_p_overlapSectionLog :
@@ -950,6 +1007,21 @@ size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter,
     }
 }
 
+size_t ZSTDMT_getMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned* value)
+{
+    switch (parameter) {
+    case ZSTDMT_p_jobSize:
+        *value = mtctx->params.jobSize;
+        break;
+    case ZSTDMT_p_overlapSectionLog:
+        *value = mtctx->params.overlapSizeLog;
+        break;
+    default:
+        return ERROR(parameter_unsupported);
+    }
+    return 0;
+}
+
 /* Sets parameters relevant to the compression job,
  * initializing others to default values. */
 static ZSTD_CCtx_params ZSTDMT_initJobCCtxParams(ZSTD_CCtx_params const params)
@@ -960,13 +1032,30 @@ static ZSTD_CCtx_params ZSTDMT_initJobCCtxParams(ZSTD_CCtx_params const params)
     jobParams.cParams = params.cParams;
     jobParams.fParams = params.fParams;
     jobParams.compressionLevel = params.compressionLevel;
-    jobParams.disableLiteralCompression = params.disableLiteralCompression;
 
     return jobParams;
 }
 
+
+/* ZSTDMT_resize() :
+ * @return : error code if fails, 0 on success */
+static size_t ZSTDMT_resize(ZSTDMT_CCtx* mtctx, unsigned nbWorkers)
+{
+    if (POOL_resize(mtctx->factory, nbWorkers)) return ERROR(memory_allocation);
+    CHECK_F( ZSTDMT_expandJobsTable(mtctx, nbWorkers) );
+    mtctx->bufPool = ZSTDMT_expandBufferPool(mtctx->bufPool, nbWorkers);
+    if (mtctx->bufPool == NULL) return ERROR(memory_allocation);
+    mtctx->cctxPool = ZSTDMT_expandCCtxPool(mtctx->cctxPool, nbWorkers);
+    if (mtctx->cctxPool == NULL) return ERROR(memory_allocation);
+    mtctx->seqPool = ZSTDMT_expandSeqPool(mtctx->seqPool, nbWorkers);
+    if (mtctx->seqPool == NULL) return ERROR(memory_allocation);
+    ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers);
+    return 0;
+}
+
+
 /*! ZSTDMT_updateCParams_whileCompressing() :
- *  Updates only a selected set of compression parameters, to remain compatible with current frame.
+ *  Updates a selected set of compression parameters, remaining compatible with the currently active frame.
  *  New parameters will be applied to next compression job. */
 void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams)
 {
@@ -981,38 +1070,36 @@ void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_p
     }
 }
 
-/* ZSTDMT_getNbWorkers():
- * @return nb threads currently active in mtctx.
- * mtctx must be valid */
-unsigned ZSTDMT_getNbWorkers(const ZSTDMT_CCtx* mtctx)
-{
-    assert(mtctx != NULL);
-    return mtctx->params.nbWorkers;
-}
-
 /* ZSTDMT_getFrameProgression():
  * tells how much data has been consumed (input) and produced (output) for current frame.
  * able to count progression inside worker threads.
- * Note : mutex will be acquired during statistics collection. */
+ * Note : mutex will be acquired during statistics collection inside workers. */
 ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx)
 {
     ZSTD_frameProgression fps;
-    DEBUGLOG(6, "ZSTDMT_getFrameProgression");
-    fps.consumed = mtctx->consumed;
-    fps.produced = mtctx->produced;
+    DEBUGLOG(5, "ZSTDMT_getFrameProgression");
     fps.ingested = mtctx->consumed + mtctx->inBuff.filled;
+    fps.consumed = mtctx->consumed;
+    fps.produced = fps.flushed = mtctx->produced;
+    fps.currentJobID = mtctx->nextJobID;
+    fps.nbActiveWorkers = 0;
     {   unsigned jobNb;
         unsigned lastJobNb = mtctx->nextJobID + mtctx->jobReady; assert(mtctx->jobReady <= 1);
         DEBUGLOG(6, "ZSTDMT_getFrameProgression: jobs: from %u to <%u (jobReady:%u)",
                     mtctx->doneJobID, lastJobNb, mtctx->jobReady)
         for (jobNb = mtctx->doneJobID ; jobNb < lastJobNb ; jobNb++) {
             unsigned const wJobID = jobNb & mtctx->jobIDMask;
-            ZSTD_pthread_mutex_lock(&mtctx->jobs[wJobID].job_mutex);
-            {   size_t const cResult = mtctx->jobs[wJobID].cSize;
+            ZSTDMT_jobDescription* jobPtr = &mtctx->jobs[wJobID];
+            ZSTD_pthread_mutex_lock(&jobPtr->job_mutex);
+            {   size_t const cResult = jobPtr->cSize;
                 size_t const produced = ZSTD_isError(cResult) ? 0 : cResult;
-                fps.consumed += mtctx->jobs[wJobID].consumed;
-                fps.ingested += mtctx->jobs[wJobID].src.size;
+                size_t const flushed = ZSTD_isError(cResult) ? 0 : jobPtr->dstFlushed;
+                assert(flushed <= produced);
+                fps.ingested += jobPtr->src.size;
+                fps.consumed += jobPtr->consumed;
                 fps.produced += produced;
+                fps.flushed  += flushed;
+                fps.nbActiveWorkers += (jobPtr->consumed < jobPtr->src.size);
             }
             ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
         }
@@ -1021,6 +1108,34 @@ ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx)
 }
 
 
+size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx)
+{
+    size_t toFlush;
+    unsigned const jobID = mtctx->doneJobID;
+    assert(jobID <= mtctx->nextJobID);
+    if (jobID == mtctx->nextJobID) return 0;   /* no active job => nothing to flush */
+
+    /* look into oldest non-fully-flushed job */
+    {   unsigned const wJobID = jobID & mtctx->jobIDMask;
+        ZSTDMT_jobDescription* const jobPtr = &mtctx->jobs[wJobID];
+        ZSTD_pthread_mutex_lock(&jobPtr->job_mutex);
+        {   size_t const cResult = jobPtr->cSize;
+            size_t const produced = ZSTD_isError(cResult) ? 0 : cResult;
+            size_t const flushed = ZSTD_isError(cResult) ? 0 : jobPtr->dstFlushed;
+            assert(flushed <= produced);
+            toFlush = produced - flushed;
+            if (toFlush==0 && (jobPtr->consumed >= jobPtr->src.size)) {
+                /* doneJobID is not-fully-flushed, but toFlush==0 : doneJobID should be compressing some more data */
+                assert(jobPtr->consumed < jobPtr->src.size);
+            }
+        }
+        ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
+    }
+
+    return toFlush;
+}
+
+
 /* ------------------------------------------ */
 /* =====   Multi-threaded compression   ===== */
 /* ------------------------------------------ */
@@ -1087,18 +1202,10 @@ static size_t ZSTDMT_compress_advanced_internal(
 
     assert(avgJobSize >= 256 KB);  /* condition for ZSTD_compressBound(A) + ZSTD_compressBound(B) <= ZSTD_compressBound(A+B), required to compress directly into Dst (no additional buffer) */
     ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(avgJobSize) );
-    if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params))
+    if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, avgJobSize))
         return ERROR(memory_allocation);
 
-    if (nbJobs > mtctx->jobIDMask+1) {  /* enlarge job table */
-        U32 jobsTableSize = nbJobs;
-        ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem);
-        mtctx->jobIDMask = 0;
-        mtctx->jobs = ZSTDMT_createJobsTable(&jobsTableSize, mtctx->cMem);
-        if (mtctx->jobs==NULL) return ERROR(memory_allocation);
-        assert((jobsTableSize != 0) && ((jobsTableSize & (jobsTableSize - 1)) == 0));  /* ensure jobsTableSize is a power of 2 */
-        mtctx->jobIDMask = jobsTableSize - 1;
-    }
+    CHECK_F( ZSTDMT_expandJobsTable(mtctx, nbJobs) );  /* only expands if necessary */
 
     {   unsigned u;
         for (u=0; u<nbJobs; u++) {
[… diff content lost in extraction: remainder of this hunk and the start of the ZSTDMT_initCStream_internal() hunk …]
-                (U32)pledgedSrcSize, params.nbWorkers, mtctx->cctxPool->totalCCtx, params.disableLiteralCompression);
-    /* params are supposed to be fully validated at this point */
+    DEBUGLOG(4, "ZSTDMT_initCStream_internal (pledgedSrcSize=%u, nbWorkers=%u, cctxPool=%u)",
+                (U32)pledgedSrcSize, params.nbWorkers, mtctx->cctxPool->totalCCtx);
+
+    /* params are supposed to be partially validated at this point */
     assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
     assert(!((dict) && (cdict)));  /* either dict or cdict, not both */
-    assert(mtctx->cctxPool->totalCCtx == params.nbWorkers);
 
     /* init */
-    if (params.jobSize == 0) {
-        params.jobSize = 1U << ZSTDMT_computeTargetJobLog(params);
-    }
+    if (params.nbWorkers != mtctx->params.nbWorkers)
+        CHECK_F( ZSTDMT_resize(mtctx, params.nbWorkers) );
+
+    if (params.jobSize > 0 && params.jobSize < ZSTDMT_JOBSIZE_MIN) params.jobSize = ZSTDMT_JOBSIZE_MIN;
     if (params.jobSize > ZSTDMT_JOBSIZE_MAX) params.jobSize = ZSTDMT_JOBSIZE_MAX;
 
     mtctx->singleBlockingThread = (pledgedSrcSize <= ZSTDMT_JOBSIZE_MIN);  /* do not trigger multi-threading when srcSize is too small */
@@ -1270,7 +1378,9 @@ size_t ZSTDMT_initCStream_internal(
     mtctx->targetPrefixSize = (size_t)1 << ZSTDMT_computeOverlapLog(params);
     DEBUGLOG(4, "overlapLog=%u => %u KB", params.overlapSizeLog, (U32)(mtctx->targetPrefixSize>>10));
     mtctx->targetSectionSize = params.jobSize;
-    if (mtctx->targetSectionSize < ZSTDMT_JOBSIZE_MIN) mtctx->targetSectionSize = ZSTDMT_JOBSIZE_MIN;
+    if (mtctx->targetSectionSize == 0) {
+        mtctx->targetSectionSize = 1ULL << ZSTDMT_computeTargetJobLog(params);
+    }
     if (mtctx->targetSectionSize < mtctx->targetPrefixSize) mtctx->targetSectionSize = mtctx->targetPrefixSize;  /* job size must be >= overlap size */
     DEBUGLOG(4, "Job Size : %u KB (note : set to %u)", (U32)(mtctx->targetSectionSize>>10), params.jobSize);
     DEBUGLOG(4, "inBuff Size : %u KB", (U32)(mtctx->targetSectionSize>>10));
@@ -1312,7 +1422,7 @@ size_t ZSTDMT_initCStream_internal(
     mtctx->allJobsCompleted = 0;
     mtctx->consumed = 0;
     mtctx->produced = 0;
-    if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params))
+    if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, mtctx->targetSectionSize))
         return ERROR(memory_allocation);
     return 0;
 }
@@ -1420,7 +1530,7 @@ static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* mtctx, size_t srcSize, ZS
         mtctx->jobs[jobID].jobID = mtctx->nextJobID;
         mtctx->jobs[jobID].firstJob = (mtctx->nextJobID==0);
         mtctx->jobs[jobID].lastJob = endFrame;
-        mtctx->jobs[jobID].frameChecksumNeeded = endFrame && (mtctx->nextJobID>0) && mtctx->params.fParams.checksumFlag;
+        mtctx->jobs[jobID].frameChecksumNeeded = mtctx->params.fParams.checksumFlag && endFrame && (mtctx->nextJobID>0);
         mtctx->jobs[jobID].dstFlushed = 0;
 
         /* Update the round buffer pos and clear the input buffer to be reset */
@@ -1468,6 +1578,8 @@ static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* mtctx, size_t srcSize, ZS
 
 
 /*! ZSTDMT_flushProduced() :
+ *  flush whatever data has been produced but not yet flushed in the current job.
+ *  move to the next job once the current one is fully flushed.
  * `output` : `pos` will be updated with amount of data flushed .
  * `blockToFlush` : if >0, the function will block and wait if there is no data available to flush .
  * @return : amount of data remaining within internal buffer, 0 if no more, 1 if unknown but > 0, or an error code */
@@ -1496,7 +1608,7 @@ static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, u
     /* try to flush something */
     {   size_t cSize = mtctx->jobs[wJobID].cSize;                  /* shared */
         size_t const srcConsumed = mtctx->jobs[wJobID].consumed;   /* shared */
-        size_t const srcSize = mtctx->jobs[wJobID].src.size;        /* read-only, could be done after mutex lock, but no-declaration-after-statement */
+        size_t const srcSize = mtctx->jobs[wJobID].src.size;       /* read-only, could be done after mutex lock, but no-declaration-after-statement */
         ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
         if (ZSTD_isError(cSize)) {
             DEBUGLOG(5, "ZSTDMT_flushProduced: job %u : compression error detected : %s",
@@ -1516,6 +1628,7 @@ static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, u
             mtctx->jobs[wJobID].cSize += 4;  /* can write this shared value, as worker is no longer active */
             mtctx->jobs[wJobID].frameChecksumNeeded = 0;
         }
+
         if (cSize > 0) {   /* compression is ongoing or completed */
             size_t const toFlush = MIN(cSize - mtctx->jobs[wJobID].dstFlushed, output->size - output->pos);
             DEBUGLOG(5, "ZSTDMT_flushProduced: Flushing %u bytes from job %u (completion:%u/%u, generated:%u)",
@@ -1529,11 +1642,12 @@ static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, u
             output->pos += toFlush;
             mtctx->jobs[wJobID].dstFlushed += toFlush;  /* can write : this value is only used by mtctx */
 
-            if ( (srcConsumed == srcSize)    /* job completed */
+            if ( (srcConsumed == srcSize)    /* job is completed */
               && (mtctx->jobs[wJobID].dstFlushed == cSize) ) {   /* output buffer fully flushed => free this job position */
                 DEBUGLOG(5, "Job %u completed (%u bytes), moving to next one",
                         mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed);
                 ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[wJobID].dstBuff);
+                DEBUGLOG(5, "dstBuffer released");
                 mtctx->jobs[wJobID].dstBuff = g_nullBuffer;
                 mtctx->jobs[wJobID].cSize = 0;   /* ensure this job slot is considered "not started" in future check */
                 mtctx->consumed += srcSize;
@@ -1610,6 +1724,7 @@ static int ZSTDMT_doesOverlapWindow(buffer_t buffer, ZSTD_window_t window)
     range_t extDict;
     range_t prefix;
 
+    DEBUGLOG(5, "ZSTDMT_doesOverlapWindow");
     extDict.start = window.dictBase + window.lowLimit;
     extDict.size = window.dictLimit - window.lowLimit;
 
@@ -1630,12 +1745,13 @@ static void ZSTDMT_waitForLdmComplete(ZSTDMT_CCtx* mtctx, buffer_t buffer)
 {
     if (mtctx->params.ldmParams.enableLdm) {
         ZSTD_pthread_mutex_t* mutex = &mtctx->serial.ldmWindowMutex;
+        DEBUGLOG(5, "ZSTDMT_waitForLdmComplete");
         DEBUGLOG(5, "source  [0x%zx, 0x%zx)",
                     (size_t)buffer.start,
                     (size_t)buffer.start + buffer.capacity);
         ZSTD_PTHREAD_MUTEX_LOCK(mutex);
         while (ZSTDMT_doesOverlapWindow(buffer, mtctx->serial.ldmWindow)) {
-            DEBUGLOG(6, "Waiting for LDM to finish...");
+            DEBUGLOG(5, "Waiting for LDM to finish...");
             ZSTD_pthread_cond_wait(&mtctx->serial.ldmWindowCond, mutex);
         }
         DEBUGLOG(6, "Done waiting for LDM to finish");
@@ -1655,6 +1771,7 @@ static int ZSTDMT_tryGetInputRange(ZSTDMT_CCtx* mtctx)
     size_t const target = mtctx->targetSectionSize;
     buffer_t buffer;
 
+    DEBUGLOG(5, "ZSTDMT_tryGetInputRange");
     assert(mtctx->inBuff.buffer.start == NULL);
     assert(mtctx->roundBuff.capacity >= target);
 
@@ -1668,7 +1785,7 @@ static int ZSTDMT_tryGetInputRange(ZSTDMT_CCtx* mtctx)
         buffer.start = start;
         buffer.capacity = prefixSize;
         if (ZSTDMT_isOverlapped(buffer, inUse)) {
-            DEBUGLOG(6, "Waiting for buffer...");
+            DEBUGLOG(5, "Waiting for buffer...");
             return 0;
         }
         ZSTDMT_waitForLdmComplete(mtctx, buffer);
@@ -1680,7 +1797,7 @@ static int ZSTDMT_tryGetInputRange(ZSTDMT_CCtx* mtctx)
     buffer.capacity = target;
 
     if (ZSTDMT_isOverlapped(buffer, inUse)) {
-        DEBUGLOG(6, "Waiting for buffer...");
+        DEBUGLOG(5, "Waiting for buffer...");
         return 0;
     }
     assert(!ZSTDMT_isOverlapped(buffer, mtctx->inBuff.prefix));
@@ -1753,8 +1870,10 @@ size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
                 /* It is only possible for this operation to fail if there are
                  * still compression jobs ongoing.
                  */
+                DEBUGLOG(5, "ZSTDMT_tryGetInputRange failed");
                 assert(mtctx->doneJobID != mtctx->nextJobID);
-            }
+            } else
+                DEBUGLOG(5, "ZSTDMT_tryGetInputRange completed successfully : mtctx->inBuff.buffer.start = %p", mtctx->inBuff.buffer.start);
         }
         if (mtctx->inBuff.buffer.start != NULL) {
             size_t const toLoad = MIN(input->size - input->pos, mtctx->targetSectionSize - mtctx->inBuff.filled);
@@ -1782,6 +1901,7 @@ size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
     /* check for potential compressed data ready to be flushed */
     {   size_t const remainingToFlush = ZSTDMT_flushProduced(mtctx, output, !forwardInputProgress, endOp); /* block if there was no forward input progress */
         if (input->pos < input->size) return MAX(remainingToFlush, 1);  /* input not consumed : do not end flush yet */
+        DEBUGLOG(5, "end of ZSTDMT_compressStream_generic: remainingToFlush = %u", (U32)remainingToFlush);
         return remainingToFlush;
     }
 }
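
The extra bookkeeping above (flushed bytes, current job ID, number of active workers) is what feeds the frame-progression report. Below is a minimal polling sketch, assuming the experimental ZSTD_getFrameProgression() wrapper in zstd.h forwards to ZSTDMT_getFrameProgression() and that ZSTD_frameProgression gains the flushed/nbActiveWorkers fields assigned above; not a definitive usage pattern.

    #include <stdio.h>
    #define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_frameProgression is part of the experimental API */
    #include "zstd.h"

    /* report how far an ongoing (possibly multi-threaded) compression has progressed */
    static void printProgress(const ZSTD_CCtx* cctx)
    {
        ZSTD_frameProgression const fp = ZSTD_getFrameProgression(cctx);
        fprintf(stderr,
                "ingested=%llu consumed=%llu produced=%llu flushed=%llu activeWorkers=%u\n",
                (unsigned long long)fp.ingested,
                (unsigned long long)fp.consumed,
                (unsigned long long)fp.produced,
                (unsigned long long)fp.flushed,
                fp.nbActiveWorkers);
    }
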
diff --git a/lib/compress/zstdmt_compress.h b/lib/compress/zstdmt_compress.h
index f79e3b441800..12ad9f899b57 100644
--- a/lib/compress/zstdmt_compress.h
+++ b/lib/compress/zstdmt_compress.h
@@ -95,6 +95,11 @@ typedef enum {
  * @return : 0, or an error code (which can be tested using ZSTD_isError()) */
 ZSTDLIB_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned value);
 
+/* ZSTDMT_getMTCtxParameter() :
+ * Query the ZSTDMT_CCtx for a parameter value.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTDMT_getMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned* value);
+
 
 /*! ZSTDMT_compressStream_generic() :
  *  Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream()
@@ -114,11 +119,21 @@ ZSTDLIB_API size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
  * ===  Not exposed in libzstd. Never invoke directly   ===
  * ======================================================== */
 
+ /*! ZSTDMT_toFlushNow()
+  *  Tell how many bytes are ready to be flushed immediately.
+  *  Probe the oldest active job (not yet entirely flushed) and check its output buffer.
+  *  A return value of 0 means either that there is no active job,
+  *  or that the oldest job is still running but everything it produced so far has already been flushed,
+  *  in which case flushing speed is limited by the speed of the oldest job. */
+size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx);
+
+/*! ZSTDMT_CCtxParam_setMTCtxParameter()
+ *  like ZSTDMT_setMTCtxParameter(), but into a ZSTD_CCtx_Params */
 size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, ZSTDMT_parameter parameter, unsigned value);
 
-/* ZSTDMT_CCtxParam_setNbWorkers()
- * Set nbWorkers, and clamp it.
- * Also reset jobSize and overlapLog */
+/*! ZSTDMT_CCtxParam_setNbWorkers()
+ *  Set nbWorkers, and clamp it.
+ *  Also reset jobSize and overlapLog */
 size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers);
 
 /*! ZSTDMT_updateCParams_whileCompressing() :
@@ -126,14 +141,9 @@ size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorker
  *  New parameters will be applied to next compression job. */
 void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams);
 
-/* ZSTDMT_getNbWorkers():
- * @return nb threads currently active in mtctx.
- * mtctx must be valid */
-unsigned ZSTDMT_getNbWorkers(const ZSTDMT_CCtx* mtctx);
-
-/* ZSTDMT_getFrameProgression():
- * tells how much data has been consumed (input) and produced (output) for current frame.
- * able to count progression inside worker threads.
+/*! ZSTDMT_getFrameProgression():
+ *  tells how much data has been consumed (input) and produced (output) for current frame.
+ *  able to count progression inside worker threads.
  */
 ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx);
 
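A short sketch of how the new getter pairs with the existing setter. ZSTDMT_* entry points are internal and not part of the stable API, so this is illustration only; it assumes a valid mtctx obtained from ZSTDMT_createCCtx(), and the requested size is merely an example value.

    #include <stdio.h>
    #include "zstdmt_compress.h"

    static void showJobSize(ZSTDMT_CCtx* mtctx)
    {
        unsigned jobSize = 0;
        size_t err = ZSTDMT_setMTCtxParameter(mtctx, ZSTDMT_p_jobSize, 1u << 20);  /* request ~1 MB */
        if (ZSTD_isError(err)) return;
        err = ZSTDMT_getMTCtxParameter(mtctx, ZSTDMT_p_jobSize, &jobSize);
        if (!ZSTD_isError(err))
            fprintf(stderr, "effective job size: %u bytes\n", jobSize);  /* setter may have clamped the value */
    }
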
diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c
index 73f5c46c0615..83ecaff01e84 100644
--- a/lib/decompress/huf_decompress.c
+++ b/lib/decompress/huf_decompress.c
@@ -1,6 +1,7 @@
 /* ******************************************************************
-   Huffman decoder, part of New Generation Entropy library
-   Copyright (C) 2013-2016, Yann Collet.
+   huff0 huffman decoder,
+   part of Finite State Entropy library
+   Copyright (C) 2013-present, Yann Collet.
 
    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -29,16 +30,15 @@
 
     You can contact the author at :
     - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
-    - Public forum : https://groups.google.com/forum/#!forum/lz4c
 ****************************************************************** */
 
 /* **************************************************************
 *  Dependencies
 ****************************************************************/
 #include <string.h>     /* memcpy, memset */
-#include "bitstream.h"  /* BIT_* */
 #include "compiler.h"
-#include "fse.h"        /* header compression */
+#include "bitstream.h"  /* BIT_* */
+#include "fse.h"        /* to compress headers */
 #define HUF_STATIC_LINKING_ONLY
 #include "huf.h"
 #include "error_private.h"
@@ -48,7 +48,6 @@
 *  Error Management
 ****************************************************************/
 #define HUF_isError ERR_isError
-#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
 #define CHECK_F(f) { size_t const err_ = (f); if (HUF_isError(err_)) return err_; }
 
 
@@ -75,15 +74,15 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
 /*-***************************/
 /*  single-symbol decoding   */
 /*-***************************/
-typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2;   /* single-symbol decoding */
+typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1;   /* single-symbol decoding */
 
-size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
 {
     U32 tableLog = 0;
     U32 nbSymbols = 0;
     size_t iSize;
     void* const dtPtr = DTable + 1;
-    HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
+    HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr;
 
     U32* rankVal;
     BYTE* huffWeight;
@@ -96,7 +95,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize
 
     if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
 
-    HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
+    DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
     /* memset(huffWeight, 0, sizeof(huffWeight)); */   /* is not necessary, even though some analyzer complain ... */
 
     iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
@@ -124,7 +123,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize
             U32 const w = huffWeight[n];
             U32 const length = (1 << w) >> 1;
             U32 u;
-            HUF_DEltX2 D;
+            HUF_DEltX1 D;
             D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
             for (u = rankVal[w]; u < rankVal[w] + length; u++)
                 dt[u] = D;
@@ -134,17 +133,15 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize
     return iSize;
 }
 
-size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
+size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize)
 {
     U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
-    return HUF_readDTableX2_wksp(DTable, src, srcSize,
+    return HUF_readDTableX1_wksp(DTable, src, srcSize,
                                  workSpace, sizeof(workSpace));
 }
 
-typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4;  /* double-symbols decoding */
-
 FORCE_INLINE_TEMPLATE BYTE
-HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
+HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog)
 {
     size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
     BYTE const c = dt[val].byte;
@@ -152,44 +149,44 @@ HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog
     return c;
 }
 
-#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
-    *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)
+#define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
+    *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
 
-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr)  \
+#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr)  \
     if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
-        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+        HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
 
-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
     if (MEM_64bits()) \
-        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+        HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
 
 HINT_INLINE size_t
-HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
+HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
 {
     BYTE* const pStart = p;
 
     /* up to 4 symbols at a time */
     while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
-        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
-        HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
-        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
-        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+        HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
     }
 
     /* [0-3] symbols remaining */
     if (MEM_32bits())
         while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd))
-            HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+            HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
 
     /* no more data to retrieve from bitstream, no need to reload */
     while (p < pEnd)
-        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+        HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
 
     return pEnd-pStart;
 }
 
 FORCE_INLINE_TEMPLATE size_t
-HUF_decompress1X2_usingDTable_internal_body(
+HUF_decompress1X1_usingDTable_internal_body(
           void* dst,  size_t dstSize,
     const void* cSrc, size_t cSrcSize,
     const HUF_DTable* DTable)
@@ -197,14 +194,14 @@ HUF_decompress1X2_usingDTable_internal_body(
     BYTE* op = (BYTE*)dst;
     BYTE* const oend = op + dstSize;
     const void* dtPtr = DTable + 1;
-    const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+    const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
     BIT_DStream_t bitD;
     DTableDesc const dtd = HUF_getDTableDesc(DTable);
     U32 const dtLog = dtd.tableLog;
 
     CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
 
-    HUF_decodeStreamX2(op, &bitD, oend, dt, dtLog);
+    HUF_decodeStreamX1(op, &bitD, oend, dt, dtLog);
 
     if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
 
@@ -212,7 +209,7 @@ HUF_decompress1X2_usingDTable_internal_body(
 }
 
 FORCE_INLINE_TEMPLATE size_t
-HUF_decompress4X2_usingDTable_internal_body(
+HUF_decompress4X1_usingDTable_internal_body(
           void* dst,  size_t dstSize,
     const void* cSrc, size_t cSrcSize,
     const HUF_DTable* DTable)
@@ -224,7 +221,7 @@ HUF_decompress4X2_usingDTable_internal_body(
         BYTE* const ostart = (BYTE*) dst;
         BYTE* const oend = ostart + dstSize;
         const void* const dtPtr = DTable + 1;
-        const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+        const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
 
         /* Init */
         BIT_DStream_t bitD1;
@@ -260,22 +257,22 @@ HUF_decompress4X2_usingDTable_internal_body(
         /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
         endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
         while ( (endSignal==BIT_DStream_unfinished) && (op4<(oend-3)) ) {
-            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
-            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
-            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
-            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
-            HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
-            HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
-            HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
-            HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
-            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
-            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
-            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
-            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
-            HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
-            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
-            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
-            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+            HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
             BIT_reloadDStream(&bitD1);
             BIT_reloadDStream(&bitD2);
             BIT_reloadDStream(&bitD3);
@@ -291,191 +288,10 @@ HUF_decompress4X2_usingDTable_internal_body(
         /* note : op4 supposed already verified within main loop */
 
         /* finish bitStreams one by one */
-        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
-        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
-        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
-        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
-
-        /* check */
-        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
-          if (!endCheck) return ERROR(corruption_detected); }
-
-        /* decoded size */
-        return dstSize;
-    }
-}
-
-
-FORCE_INLINE_TEMPLATE U32
-HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
-{
-    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
-    memcpy(op, dt+val, 2);
-    BIT_skipBits(DStream, dt[val].nbBits);
-    return dt[val].length;
-}
-
-FORCE_INLINE_TEMPLATE U32
-HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
-{
-    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
-    memcpy(op, dt+val, 1);
-    if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
-    else {
-        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
-            BIT_skipBits(DStream, dt[val].nbBits);
-            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
-                /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
-                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
-    }   }
-    return 1;
-}
-
-#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
-    ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
-
-#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
-    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
-        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
-
-#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
-    if (MEM_64bits()) \
-        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
-
-HINT_INLINE size_t
-HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
-                const HUF_DEltX4* const dt, const U32 dtLog)
-{
-    BYTE* const pStart = p;
-
-    /* up to 8 symbols at a time */
-    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
-        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
-        HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
-        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
-        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
-    }
-
-    /* closer to end : up to 2 symbols at a time */
-    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
-        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
-
-    while (p <= pEnd-2)
-        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
-
-    if (p < pEnd)
-        p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
-
-    return p-pStart;
-}
-
-FORCE_INLINE_TEMPLATE size_t
-HUF_decompress1X4_usingDTable_internal_body(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    BIT_DStream_t bitD;
-
-    /* Init */
-    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
-
-    /* decode */
-    {   BYTE* const ostart = (BYTE*) dst;
-        BYTE* const oend = ostart + dstSize;
-        const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
-        const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
-        DTableDesc const dtd = HUF_getDTableDesc(DTable);
-        HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog);
-    }
-
-    /* check */
-    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
-
-    /* decoded size */
-    return dstSize;
-}
-
-
-FORCE_INLINE_TEMPLATE size_t
-HUF_decompress4X4_usingDTable_internal_body(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
-
-    {   const BYTE* const istart = (const BYTE*) cSrc;
-        BYTE* const ostart = (BYTE*) dst;
-        BYTE* const oend = ostart + dstSize;
-        const void* const dtPtr = DTable+1;
-        const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
-
-        /* Init */
-        BIT_DStream_t bitD1;
-        BIT_DStream_t bitD2;
-        BIT_DStream_t bitD3;
-        BIT_DStream_t bitD4;
-        size_t const length1 = MEM_readLE16(istart);
-        size_t const length2 = MEM_readLE16(istart+2);
-        size_t const length3 = MEM_readLE16(istart+4);
-        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
-        const BYTE* const istart1 = istart + 6;  /* jumpTable */
-        const BYTE* const istart2 = istart1 + length1;
-        const BYTE* const istart3 = istart2 + length2;
-        const BYTE* const istart4 = istart3 + length3;
-        size_t const segmentSize = (dstSize+3) / 4;
-        BYTE* const opStart2 = ostart + segmentSize;
-        BYTE* const opStart3 = opStart2 + segmentSize;
-        BYTE* const opStart4 = opStart3 + segmentSize;
-        BYTE* op1 = ostart;
-        BYTE* op2 = opStart2;
-        BYTE* op3 = opStart3;
-        BYTE* op4 = opStart4;
-        U32 endSignal;
-        DTableDesc const dtd = HUF_getDTableDesc(DTable);
-        U32 const dtLog = dtd.tableLog;
-
-        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
-        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
-        CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
-        CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
-        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
-
-        /* 16-32 symbols per loop (4-8 symbols per stream) */
-        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
-        for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
-            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
-            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
-            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
-            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
-            HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
-            HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
-            HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
-            HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
-            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
-            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
-            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
-            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
-            HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
-            HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
-            HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
-            HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
-
-            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
-        }
-
-        /* check corruption */
-        if (op1 > opStart2) return ERROR(corruption_detected);
-        if (op2 > opStart3) return ERROR(corruption_detected);
-        if (op3 > opStart4) return ERROR(corruption_detected);
-        /* note : op4 already verified within main loop */
-
-        /* finish bitStreams one by one */
-        HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
-        HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
-        HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
-        HUF_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+        HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX1(op4, &bitD4, oend,     dt, dtLog);
 
         /* check */
         { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
@@ -493,7 +309,7 @@ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
                                                const HUF_DTable *DTable);
 #if DYNAMIC_BMI2
 
-#define X(fn)                                                               \
+#define HUF_DGEN(fn)                                                               \
                                                                             \
     static size_t fn##_default(                                             \
                   void* dst,  size_t dstSize,                               \
@@ -522,7 +338,7 @@ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
 
 #else
 
-#define X(fn)                                                               \
+#define HUF_DGEN(fn)                                                               \
     static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
                      size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
     {                                                                       \
@@ -532,112 +348,114 @@ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
 
 #endif
 
-X(HUF_decompress1X2_usingDTable_internal)
-X(HUF_decompress4X2_usingDTable_internal)
-X(HUF_decompress1X4_usingDTable_internal)
-X(HUF_decompress4X4_usingDTable_internal)
-
-#undef X
+HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
+HUF_DGEN(HUF_decompress4X1_usingDTable_internal)
 
 
-size_t HUF_decompress1X2_usingDTable(
+
+size_t HUF_decompress1X1_usingDTable(
           void* dst,  size_t dstSize,
     const void* cSrc, size_t cSrcSize,
     const HUF_DTable* DTable)
 {
     DTableDesc dtd = HUF_getDTableDesc(DTable);
     if (dtd.tableType != 0) return ERROR(GENERIC);
-    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
 }
 
-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
                                    void* workSpace, size_t wkspSize)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
+    size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
+    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
 }
 
 
-size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
                               const void* cSrc, size_t cSrcSize)
 {
     U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
-    return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
+    return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
                                        workSpace, sizeof(workSpace));
 }
 
-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
 {
-    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
-    return HUF_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
+    HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
 }
 
-size_t HUF_decompress4X2_usingDTable(
+size_t HUF_decompress4X1_usingDTable(
           void* dst,  size_t dstSize,
     const void* cSrc, size_t cSrcSize,
     const HUF_DTable* DTable)
 {
     DTableDesc dtd = HUF_getDTableDesc(DTable);
     if (dtd.tableType != 0) return ERROR(GENERIC);
-    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
 }
 
-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
                                    void* workSpace, size_t wkspSize, int bmi2)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t const hSize = HUF_readDTableX2_wksp (dctx, cSrc, cSrcSize,
+    size_t const hSize = HUF_readDTableX1_wksp (dctx, cSrc, cSrcSize,
                                                 workSpace, wkspSize);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
 }
 
-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
                                    void* workSpace, size_t wkspSize)
 {
-    return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
+    return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
 }
 
 
-size_t HUF_decompress4X2_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
 {
     U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
-    return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+    return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
                                        workSpace, sizeof(workSpace));
 }
-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
 {
-    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
-    return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+    HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
 }
 
 
 /* *************************/
 /* double-symbols decoding */
 /* *************************/
-typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
 
-/* HUF_fillDTableX4Level2() :
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2;  /* double-symbols decoding */
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
+typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
+
+
+/* HUF_fillDTableX2Level2() :
  * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
-static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed,
                            const U32* rankValOrigin, const int minWeight,
                            const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
                            U32 nbBitsBaseline, U16 baseSeq)
 {
-    HUF_DEltX4 DElt;
+    HUF_DEltX2 DElt;
     U32 rankVal[HUF_TABLELOG_MAX + 1];
 
     /* get pre-calculated rankVal */
@@ -672,10 +490,8 @@ static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 co
     }   }
 }
 
-typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
-typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
 
-static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
+static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
                            const sortedSymbol_t* sortedList, const U32 sortedListSize,
                            const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
                            const U32 nbBitsBaseline)
@@ -700,12 +516,12 @@ static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
             int minWeight = nbBits + scaleLog;
             if (minWeight < 1) minWeight = 1;
             sortedRank = rankStart[minWeight];
-            HUF_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+            HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits,
                            rankValOrigin[nbBits], minWeight,
                            sortedList+sortedRank, sortedListSize-sortedRank,
                            nbBitsBaseline, symbol);
         } else {
-            HUF_DEltX4 DElt;
+            HUF_DEltX2 DElt;
             MEM_writeLE16(&(DElt.sequence), symbol);
             DElt.nbBits = (BYTE)(nbBits);
             DElt.length = 1;
@@ -717,16 +533,16 @@ static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
     }
 }
 
-size_t HUF_readDTableX4_wksp(HUF_DTable* DTable, const void* src,
-                             size_t srcSize, void* workSpace,
-                             size_t wkspSize)
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
+                       const void* src, size_t srcSize,
+                             void* workSpace, size_t wkspSize)
 {
     U32 tableLog, maxW, sizeOfSort, nbSymbols;
     DTableDesc dtd = HUF_getDTableDesc(DTable);
     U32 const maxTableLog = dtd.maxTableLog;
     size_t iSize;
     void* dtPtr = DTable+1;   /* force compiler to avoid strict-aliasing */
-    HUF_DEltX4* const dt = (HUF_DEltX4*)dtPtr;
+    HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
     U32 *rankStart;
 
     rankValCol_t* rankVal;
@@ -752,7 +568,7 @@ size_t HUF_readDTableX4_wksp(HUF_DTable* DTable, const void* src,
     rankStart = rankStart0 + 1;
     memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
 
-    HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(HUF_DTable));   /* if compiler fails here, assertion is wrong */
+    DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable));   /* if compiler fails here, assertion is wrong */
     if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
     /* memset(weightList, 0, sizeof(weightList)); */  /* is not necessary, even though some analyzers complain ... */
 
@@ -806,7 +622,7 @@ size_t HUF_readDTableX4_wksp(HUF_DTable* DTable, const void* src,
                     rankValPtr[w] = rankVal0[w] >> consumed;
     }   }   }   }
 
-    HUF_fillDTableX4(dt, maxTableLog,
+    HUF_fillDTableX2(dt, maxTableLog,
                    sortedSymbol, sizeOfSort,
                    rankStart0, rankVal, maxW,
                    tableLog+1);
@@ -817,112 +633,296 @@ size_t HUF_readDTableX4_wksp(HUF_DTable* DTable, const void* src,
     return iSize;
 }
 
-size_t HUF_readDTableX4(HUF_DTable* DTable, const void* src, size_t srcSize)
+size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
 {
   U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
-  return HUF_readDTableX4_wksp(DTable, src, srcSize,
+  return HUF_readDTableX2_wksp(DTable, src, srcSize,
                                workSpace, sizeof(workSpace));
 }
 
-size_t HUF_decompress1X4_usingDTable(
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 2);
+    BIT_skipBits(DStream, dt[val].nbBits);
+    return dt[val].length;
+}
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 1);
+    if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
+    else {
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
+            BIT_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
+    }   }
+    return 1;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+        ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+
+HINT_INLINE size_t
+HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
+                const HUF_DEltX2* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* closer to end : up to 2 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)
+        p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X2_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    BIT_DStream_t bitD;
+
+    /* Init */
+    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+
+    /* decode */
+    {   BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
+        const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        HUF_decodeStreamX2(ostart, &bitD, oend, dt, dtd.tableLog);
+    }
+
+    /* check */
+    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    /* decoded size */
+    return dstSize;
+}
+
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X2_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable+1;
+        const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        size_t const segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+        CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+        CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+          if (!endCheck) return ERROR(corruption_detected); }
+
+        /* decoded size */
+        return dstSize;
+    }
+}
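
For illustration, a minimal sketch (not part of the patch) of how the 4-streams input parsed by HUF_decompress4X2_usingDTable_internal_body above is framed: a 6-byte jump table holding three little-endian 16-bit stream sizes, with the fourth size being whatever remains; each stream then regenerates a segment of (dstSize+3)/4 output bytes, the last one taking the remainder. The helper names read_le16 and parse_huf_jump_table are hypothetical.

    #include <stddef.h>
    #include <stdint.h>

    static uint16_t read_le16(const unsigned char* p)
    {
        return (uint16_t)(p[0] | (p[1] << 8));
    }

    /* Fills lengths[0..3] from a 4-streams HUF input.
     * Returns 0 on success, -1 if the declared stream sizes do not fit in
     * srcSize (same intent as the overflow check in the function above). */
    static int parse_huf_jump_table(const unsigned char* src, size_t srcSize,
                                    size_t lengths[4])
    {
        if (srcSize < 10) return -1;   /* jump table (6 bytes) + at least 1 byte per stream */
        lengths[0] = read_le16(src);
        lengths[1] = read_le16(src + 2);
        lengths[2] = read_le16(src + 4);
        if (lengths[0] + lengths[1] + lengths[2] + 6 > srcSize) return -1;
        lengths[3] = srcSize - (lengths[0] + lengths[1] + lengths[2] + 6);
        return 0;
    }
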
+
+HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
+HUF_DGEN(HUF_decompress4X2_usingDTable_internal)
+
+size_t HUF_decompress1X2_usingDTable(
           void* dst,  size_t dstSize,
     const void* cSrc, size_t cSrcSize,
     const HUF_DTable* DTable)
 {
     DTableDesc dtd = HUF_getDTableDesc(DTable);
     if (dtd.tableType != 1) return ERROR(GENERIC);
-    return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
 }
 
-size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
                                    void* workSpace, size_t wkspSize)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t const hSize = HUF_readDTableX4_wksp(DCtx, cSrc, cSrcSize,
+    size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
                                                workSpace, wkspSize);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress1X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
 }
 
 
-size_t HUF_decompress1X4_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
                               const void* cSrc, size_t cSrcSize)
 {
     U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
-    return HUF_decompress1X4_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
+    return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
                                        workSpace, sizeof(workSpace));
 }
 
-size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
 {
-    HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_TABLELOG_MAX);
-    return HUF_decompress1X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
 }
 
-size_t HUF_decompress4X4_usingDTable(
+size_t HUF_decompress4X2_usingDTable(
           void* dst,  size_t dstSize,
     const void* cSrc, size_t cSrcSize,
     const HUF_DTable* DTable)
 {
     DTableDesc dtd = HUF_getDTableDesc(DTable);
     if (dtd.tableType != 1) return ERROR(GENERIC);
-    return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
 }
 
-static size_t HUF_decompress4X4_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
                                    void* workSpace, size_t wkspSize, int bmi2)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t hSize = HUF_readDTableX4_wksp(dctx, cSrc, cSrcSize,
+    size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
                                          workSpace, wkspSize);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
 }
 
-size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
                                    void* workSpace, size_t wkspSize)
 {
-    return HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
+    return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
 }
 
 
-size_t HUF_decompress4X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
+size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
                               const void* cSrc, size_t cSrcSize)
 {
     U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
-    return HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+    return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
                                        workSpace, sizeof(workSpace));
 }
 
-size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
 {
-    HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_TABLELOG_MAX);
-    return HUF_decompress4X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
 }
 
 
-/* ********************************/
-/* Generic decompression selector */
-/* ********************************/
+/* ***********************************/
+/* Universal decompression selectors */
+/* ***********************************/
 
 size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
                                     const void* cSrc, size_t cSrcSize,
                                     const HUF_DTable* DTable)
 {
     DTableDesc const dtd = HUF_getDTableDesc(DTable);
-    return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
-                           HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
+                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
 }
 
 size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
@@ -930,8 +930,8 @@ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
                                     const HUF_DTable* DTable)
 {
     DTableDesc const dtd = HUF_getDTableDesc(DTable);
-    return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
-                           HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
+                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
 }
 
 
@@ -960,12 +960,12 @@ static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, qu
 /** HUF_selectDecoder() :
  *  Tells which decoder is likely to decode faster,
  *  based on a set of pre-computed metrics.
- * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+ * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
  *  Assumption : 0 < dstSize <= 128 KB */
 U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
 {
     assert(dstSize > 0);
-    assert(dstSize <= 128 KB);
+    assert(dstSize <= 128*1024);
     /* decoder timing evaluation */
     {   U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 */
         U32 const D256 = (U32)(dstSize >> 8);
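
As a worked example of the quantization above (illustrative figures only; the algoTime table itself is elided from this hunk): for dstSize = 100 KB (102400 bytes) and cSrcSize = 40 KB (40960 bytes), Q = 40960*16/102400 = 6 and D256 = 102400>>8 = 400; the pre-computed metrics for single-symbol (X1) and double-symbol (X2) decoding at that quantization point are then used to estimate each decoder's time, and the cheaper one is selected.
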
@@ -980,7 +980,7 @@ typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc,
 
 size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
 {
-    static const decompressionAlgo decompress[2] = { HUF_decompress4X2, HUF_decompress4X4 };
+    static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 };
 
     /* validation checks */
     if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1002,8 +1002,8 @@ size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const
     if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
 
     {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-        return algoNb ? HUF_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
-                        HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
+        return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
+                        HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
     }
 }
 
@@ -1025,8 +1025,8 @@ size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
     if (cSrcSize == 0) return ERROR(corruption_detected);
 
     {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-        return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize):
-                        HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
+        return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize):
+                        HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
     }
 }
 
@@ -1041,9 +1041,9 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
     if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
 
     {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-        return algoNb ? HUF_decompress1X4_DCtx_wksp(dctx, dst, dstSize, cSrc,
+        return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
                                 cSrcSize, workSpace, wkspSize):
-                        HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+                        HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
                                 cSrcSize, workSpace, wkspSize);
     }
 }
@@ -1060,27 +1060,27 @@ size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
 size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
 {
     DTableDesc const dtd = HUF_getDTableDesc(DTable);
-    return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
-                           HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
+                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
 }
 
-size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t const hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
+    size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
 }
 
 size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
 {
     DTableDesc const dtd = HUF_getDTableDesc(DTable);
-    return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
-                           HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
+                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
 }
 
 size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
@@ -1090,7 +1090,7 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
     if (cSrcSize == 0) return ERROR(corruption_detected);
 
     {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-        return algoNb ? HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
-                        HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+        return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
+                        HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
     }
 }
diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c
index 3ec6a1cb328b..711b5b6d7aca 100644
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@@ -40,12 +40,24 @@
 #  define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_DEFAULTMAX) + 1)
 #endif
 
+/*!
+ *  NO_FORWARD_PROGRESS_MAX :
+ *  maximum allowed nb of calls to ZSTD_decompressStream() and ZSTD_decompress_generic()
+ *  without any forward progress
+ *  (defined as: no byte read from input, and no byte flushed to output)
+ *  before triggering an error.
+ */
+#ifndef ZSTD_NO_FORWARD_PROGRESS_MAX
+#  define ZSTD_NO_FORWARD_PROGRESS_MAX 16
+#endif
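
For context, a minimal caller-side sketch (not part of the patch) of the streaming loop this guard protects. If an application keeps calling ZSTD_decompressStream() without ever consuming input or draining output — for instance by repeatedly passing an output buffer that is already full — the counter above makes the library return an error after ZSTD_NO_FORWARD_PROGRESS_MAX attempts instead of letting the loop spin forever. The sketch uses only the public streaming API; error handling and framing assumptions (one frame ending exactly at EOF) are simplified.

    #include <zstd.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int decompress_stream(FILE* fin, FILE* fout)
    {
        size_t const inSize  = ZSTD_DStreamInSize();
        size_t const outSize = ZSTD_DStreamOutSize();
        void* const inBuf  = malloc(inSize);
        void* const outBuf = malloc(outSize);
        ZSTD_DStream* const ds = ZSTD_createDStream();
        int ret = 0;
        if (!inBuf || !outBuf || !ds) { ret = -1; goto _end; }
        ZSTD_initDStream(ds);
        for (;;) {
            size_t const readSize = fread(inBuf, 1, inSize, fin);
            ZSTD_inBuffer input = { inBuf, readSize, 0 };
            if (readSize == 0) break;
            while (input.pos < input.size) {   /* forward progress : input.pos and/or output.pos advance */
                ZSTD_outBuffer output = { outBuf, outSize, 0 };
                size_t const r = ZSTD_decompressStream(ds, &output, &input);
                if (ZSTD_isError(r)) { ret = -1; goto _end; }
                fwrite(outBuf, 1, output.pos, fout);
            }
        }
    _end:
        ZSTD_freeDStream(ds);
        free(inBuf); free(outBuf);
        return ret;
    }
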
+
 
 /*-*******************************************************
 *  Dependencies
 *********************************************************/
 #include <string.h>      /* memcpy, memmove, memset */
-#include "cpu.h"
+#include "compiler.h"    /* prefetch */
+#include "cpu.h"         /* bmi2 */
 #include "mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
 #include "fse.h"
@@ -57,6 +69,9 @@
 #  include "zstd_legacy.h"
 #endif
 
+static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict);
+static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict);
+
 
 /*-*************************************
 *  Errors
@@ -99,11 +114,10 @@ typedef struct {
 #define SEQSYMBOL_TABLE_SIZE(log)   (1 + (1 << (log)))
 
 typedef struct {
-    ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];
-    ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];
-    ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];
+    ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];    /* Note : Space reserved for FSE Tables */
+    ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];   /* is also used as temporary workspace while building hufTable during DDict creation */
+    ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];    /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
     HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)];  /* can accommodate HUF_decompress4X */
-    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
     U32 rep[ZSTD_REP_NUM];
 } ZSTD_entropyDTables_t;
 
@@ -114,9 +128,10 @@ struct ZSTD_DCtx_s
     const ZSTD_seqSymbol* OFTptr;
     const HUF_DTable* HUFptr;
     ZSTD_entropyDTables_t entropy;
+    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];   /* space needed when building huffman tables */
     const void* previousDstEnd;   /* detect continuity */
-    const void* base;             /* start of current segment */
-    const void* vBase;            /* virtual start of previous segment if it was just before current one */
+    const void* prefixStart;      /* start of current segment */
+    const void* virtualStart;     /* virtual start of previous segment if it was just before current one */
     const void* dictEnd;          /* end of previous segment */
     size_t expected;
     ZSTD_frameHeader fParams;
@@ -127,7 +142,6 @@ struct ZSTD_DCtx_s
     U32 fseEntropy;
     XXH64_state_t xxhState;
     size_t headerSize;
-    U32 dictID;
     ZSTD_format_e format;
     const BYTE* litPtr;
     ZSTD_customMem customMem;
@@ -136,9 +150,13 @@ struct ZSTD_DCtx_s
     size_t staticSize;
     int bmi2;                     /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
 
-    /* streaming */
+    /* dictionary */
     ZSTD_DDict* ddictLocal;
-    const ZSTD_DDict* ddict;
+    const ZSTD_DDict* ddict;     /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
+    U32 dictID;
+    int ddictIsCold;             /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
+
+    /* streaming */
     ZSTD_dStreamStage streamStage;
     char*  inBuff;
     size_t inBuffSize;
@@ -153,6 +171,7 @@ struct ZSTD_DCtx_s
     U32 previousLegacyVersion;
     U32 legacyVersion;
     U32 hostageByte;
+    int noForwardProgress;
 
     /* workspace */
     BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
@@ -173,7 +192,7 @@ size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); }
 static size_t ZSTD_startingInputLength(ZSTD_format_e format)
 {
     size_t const startingInputLength = (format==ZSTD_f_zstd1_magicless) ?
-                    ZSTD_frameHeaderSize_prefix - ZSTD_frameIdSize :
+                    ZSTD_frameHeaderSize_prefix - ZSTD_FRAMEIDSIZE :
                     ZSTD_frameHeaderSize_prefix;
     ZSTD_STATIC_ASSERT(ZSTD_FRAMEHEADERSIZE_PREFIX >= ZSTD_FRAMEIDSIZE);
     /* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless */
@@ -188,10 +207,15 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
     dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
     dctx->ddict       = NULL;
     dctx->ddictLocal  = NULL;
+    dctx->dictEnd     = NULL;
+    dctx->ddictIsCold = 0;
     dctx->inBuff      = NULL;
     dctx->inBuffSize  = 0;
     dctx->outBuffSize = 0;
     dctx->streamStage = zdss_init;
+    dctx->legacyContext = NULL;
+    dctx->previousLegacyVersion = 0;
+    dctx->noForwardProgress = 0;
     dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
 }
 
@@ -215,8 +239,6 @@ ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem)
     {   ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_malloc(sizeof(*dctx), customMem);
         if (!dctx) return NULL;
         dctx->customMem = customMem;
-        dctx->legacyContext = NULL;
-        dctx->previousLegacyVersion = 0;
         ZSTD_initDCtx_internal(dctx);
         return dctx;
     }
@@ -265,7 +287,7 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
  *  Note 3 : Skippable Frame Identifiers are considered valid. */
 unsigned ZSTD_isFrame(const void* buffer, size_t size)
 {
-    if (size < ZSTD_frameIdSize) return 0;
+    if (size < ZSTD_FRAMEIDSIZE) return 0;
     {   U32 const magic = MEM_readLE32(buffer);
         if (magic == ZSTD_MAGICNUMBER) return 1;
         if ((magic & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) return 1;
@@ -298,25 +320,28 @@ static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZST
 
 /** ZSTD_frameHeaderSize() :
  *  srcSize must be >= ZSTD_frameHeaderSize_prefix.
- * @return : size of the Frame Header */
+ * @return : size of the Frame Header,
+ *           or an error code (if srcSize is too small) */
 size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
 {
     return ZSTD_frameHeaderSize_internal(src, srcSize, ZSTD_f_zstd1);
 }
 
 
-/** ZSTD_getFrameHeader_internal() :
+/** ZSTD_getFrameHeader_advanced() :
  *  decode Frame Header, or require larger `srcSize`.
  *  note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
  * @return : 0, `zfhPtr` is correctly filled,
  *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
  *           or an error code, which can be tested using ZSTD_isError() */
-static size_t ZSTD_getFrameHeader_internal(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format)
+size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format)
 {
     const BYTE* ip = (const BYTE*)src;
     size_t const minInputSize = ZSTD_startingInputLength(format);
 
+    memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzers do not understand that zfhPtr will only be read if the return value is zero, since they are 2 different signals */
     if (srcSize < minInputSize) return minInputSize;
+    if (src==NULL) return ERROR(GENERIC);   /* invalid parameter */
 
     if ( (format != ZSTD_f_zstd1_magicless)
       && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
@@ -325,7 +350,7 @@ static size_t ZSTD_getFrameHeader_internal(ZSTD_frameHeader* zfhPtr, const void*
             if (srcSize < ZSTD_skippableHeaderSize)
                 return ZSTD_skippableHeaderSize; /* magic number + frame length */
             memset(zfhPtr, 0, sizeof(*zfhPtr));
-            zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_frameIdSize);
+            zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE);
             zfhPtr->frameType = ZSTD_skippableFrame;
             return 0;
         }
@@ -394,7 +419,7 @@ static size_t ZSTD_getFrameHeader_internal(ZSTD_frameHeader* zfhPtr, const void*
  *           or an error code, which can be tested using ZSTD_isError() */
 size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize)
 {
-    return ZSTD_getFrameHeader_internal(zfhPtr, src, srcSize, ZSTD_f_zstd1);
+    return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1);
 }
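
For illustration, a small sketch (not part of the patch) of the return convention documented above, as seen from a caller; probe_frame_header is a hypothetical helper, and ZSTD_getFrameHeader()/ZSTD_frameHeader are assumed to be exposed via the ZSTD_STATIC_LINKING_ONLY section of zstd.h.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Returns 0 when zfh has been filled, a positive byte count when the caller
     * must supply at least that much input, and -1 on an unrecognized frame. */
    static long probe_frame_header(const void* buf, size_t have, ZSTD_frameHeader* zfh)
    {
        size_t const r = ZSTD_getFrameHeader(zfh, buf, have);
        if (ZSTD_isError(r)) return -1;   /* not a valid zstd (or skippable) frame header */
        if (r > 0) return (long)r;        /* call again with at least r bytes of input */
        return 0;                         /* success : zfh->frameContentSize, windowSize, ... are set */
    }
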
 
 
@@ -437,7 +462,7 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
             size_t skippableSize;
             if (srcSize < ZSTD_skippableHeaderSize)
                 return ERROR(srcSize_wrong);
-            skippableSize = MEM_readLE32((const BYTE *)src + ZSTD_frameIdSize)
+            skippableSize = MEM_readLE32((const BYTE *)src + ZSTD_FRAMEIDSIZE)
                           + ZSTD_skippableHeaderSize;
             if (srcSize < skippableSize) {
                 return ZSTD_CONTENTSIZE_ERROR;
@@ -491,7 +516,7 @@ unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize)
 *   @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */
 static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize)
 {
-    size_t const result = ZSTD_getFrameHeader_internal(&(dctx->fParams), src, headerSize, dctx->format);
+    size_t const result = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format);
     if (ZSTD_isError(result)) return result;    /* invalid header */
     if (result>0) return ERROR(srcSize_wrong);  /* headerSize too small */
     if (dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID))
@@ -526,6 +551,7 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
 static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity,
                           const void* src, size_t srcSize)
 {
+    if (dst==NULL) return ERROR(dstSize_tooSmall);
     if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall);
     memcpy(dst, src, srcSize);
     return srcSize;
@@ -542,6 +568,9 @@ static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity,
     return regenSize;
 }
 
+/* Hidden declaration for fullbench */
+size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+                          const void* src, size_t srcSize);
 /*! ZSTD_decodeLiteralsBlock() :
  * @return : nb of bytes read from src (< srcSize )
  *  note : symbol not declared but exposed for fullbench */
@@ -558,6 +587,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
         case set_repeat:
             if (dctx->litEntropy==0) return ERROR(dictionary_corrupted);
             /* fall-through */
+
         case set_compressed:
             if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
             {   size_t lhSize, litSize, litCSize;
@@ -589,15 +619,20 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                 if (litSize > ZSTD_BLOCKSIZE_MAX) return ERROR(corruption_detected);
                 if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
 
+                /* prefetch huffman table if cold */
+                if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
+                    PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
+                }
+
                 if (HUF_isError((litEncType==set_repeat) ?
                                     ( singleStream ?
                                         HUF_decompress1X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) :
                                         HUF_decompress4X_usingDTable_bmi2(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, dctx->bmi2) ) :
                                     ( singleStream ?
-                                        HUF_decompress1X2_DCtx_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                                                                         dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2) :
+                                        HUF_decompress1X1_DCtx_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
+                                                                         dctx->workspace, sizeof(dctx->workspace), dctx->bmi2) :
                                         HUF_decompress4X_hufOnly_wksp_bmi2(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                                                                           dctx->entropy.workspace, sizeof(dctx->entropy.workspace), dctx->bmi2))))
+                                                                           dctx->workspace, sizeof(dctx->workspace), dctx->bmi2))))
                     return ERROR(corruption_detected);
 
                 dctx->litPtr = dctx->litBuffer;
@@ -869,7 +904,8 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
                                  symbolEncodingType_e type, U32 max, U32 maxLog,
                                  const void* src, size_t srcSize,
                                  const U32* baseValue, const U32* nbAdditionalBits,
-                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable)
+                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
+                                 int ddictIsCold, int nbSeq)
 {
     switch(type)
     {
@@ -888,6 +924,12 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
         return 0;
     case set_repeat:
         if (!flagRepeatTable) return ERROR(corruption_detected);
+        /* prefetch FSE table if used */
+        if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
+            const void* const pStart = *DTablePtr;
+            size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
+            PREFETCH_AREA(pStart, pSize);
+        }
         return 0;
     case set_compressed :
         {   U32 tableLog;
@@ -933,6 +975,9 @@ static const U32 ML_base[MaxML+1] = {
                     67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
                     0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
 
+/* Hidden declaration for fullbench */
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+                             const void* src, size_t srcSize);
 
 size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                              const void* src, size_t srcSize)
@@ -940,25 +985,25 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
     const BYTE* const istart = (const BYTE* const)src;
     const BYTE* const iend = istart + srcSize;
     const BYTE* ip = istart;
+    int nbSeq;
     DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
 
     /* check */
     if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);
 
     /* SeqHead */
-    {   int nbSeq = *ip++;
-        if (!nbSeq) { *nbSeqPtr=0; return 1; }
-        if (nbSeq > 0x7F) {
-            if (nbSeq == 0xFF) {
-                if (ip+2 > iend) return ERROR(srcSize_wrong);
-                nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
-            } else {
-                if (ip >= iend) return ERROR(srcSize_wrong);
-                nbSeq = ((nbSeq-0x80)<<8) + *ip++;
-            }
+    nbSeq = *ip++;
+    if (!nbSeq) { *nbSeqPtr=0; return 1; }
+    if (nbSeq > 0x7F) {
+        if (nbSeq == 0xFF) {
+            if (ip+2 > iend) return ERROR(srcSize_wrong);
+            nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+        } else {
+            if (ip >= iend) return ERROR(srcSize_wrong);
+            nbSeq = ((nbSeq-0x80)<<8) + *ip++;
         }
-        *nbSeqPtr = nbSeq;
     }
+    *nbSeqPtr = nbSeq;
 
     /* FSE table descriptors */
     if (ip+4 > iend) return ERROR(srcSize_wrong); /* minimum possible size */
@@ -972,7 +1017,8 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                       LLtype, MaxLL, LLFSELog,
                                                       ip, iend-ip,
                                                       LL_base, LL_bits,
-                                                      LL_defaultDTable, dctx->fseEntropy);
+                                                      LL_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq);
             if (ZSTD_isError(llhSize)) return ERROR(corruption_detected);
             ip += llhSize;
         }
@@ -981,7 +1027,8 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                       OFtype, MaxOff, OffFSELog,
                                                       ip, iend-ip,
                                                       OF_base, OF_bits,
-                                                      OF_defaultDTable, dctx->fseEntropy);
+                                                      OF_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq);
             if (ZSTD_isError(ofhSize)) return ERROR(corruption_detected);
             ip += ofhSize;
         }
@@ -990,12 +1037,23 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                                                       MLtype, MaxML, MLFSELog,
                                                       ip, iend-ip,
                                                       ML_base, ML_bits,
-                                                      ML_defaultDTable, dctx->fseEntropy);
+                                                      ML_defaultDTable, dctx->fseEntropy,
+                                                      dctx->ddictIsCold, nbSeq);
             if (ZSTD_isError(mlhSize)) return ERROR(corruption_detected);
             ip += mlhSize;
         }
     }
 
+    /* prefetch dictionary content */
+    if (dctx->ddictIsCold) {
+        size_t const dictSize = (const char*)dctx->prefixStart - (const char*)dctx->virtualStart;
+        size_t const psmin = MIN(dictSize, (size_t)(64*nbSeq) /* heuristic */ );
+        size_t const pSize = MIN(psmin, 128 KB /* protection */ );
+        const void* const pStart = (const char*)dctx->dictEnd - pSize;
+        PREFETCH_AREA(pStart, pSize);
+        dctx->ddictIsCold = 0;
+    }
+
     return ip-istart;
 }
 
@@ -1075,7 +1133,7 @@ HINT_INLINE
 size_t ZSTD_execSequence(BYTE* op,
                          BYTE* const oend, seq_t sequence,
                          const BYTE** litPtr, const BYTE* const litLimit,
-                         const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+                         const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
 {
     BYTE* const oLitEnd = op + sequence.litLength;
     size_t const sequenceLength = sequence.litLength + sequence.matchLength;
@@ -1087,7 +1145,7 @@ size_t ZSTD_execSequence(BYTE* op,
     /* check */
     if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
     if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
-    if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd);
+    if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
 
     /* copy Literals */
     ZSTD_copy8(op, *litPtr);
@@ -1097,11 +1155,11 @@ size_t ZSTD_execSequence(BYTE* op,
     *litPtr = iLitEnd;   /* update for next sequence */
 
     /* copy Match */
-    if (sequence.offset > (size_t)(oLitEnd - base)) {
+    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
         /* offset beyond prefix -> go into extDict */
-        if (sequence.offset > (size_t)(oLitEnd - vBase))
+        if (sequence.offset > (size_t)(oLitEnd - virtualStart))
             return ERROR(corruption_detected);
-        match = dictEnd + (match - base);
+        match = dictEnd + (match - prefixStart);
         if (match + sequence.matchLength <= dictEnd) {
             memmove(oLitEnd, match, sequence.matchLength);
             return sequenceLength;
@@ -1111,7 +1169,7 @@ size_t ZSTD_execSequence(BYTE* op,
             memmove(oLitEnd, match, length1);
             op = oLitEnd + length1;
             sequence.matchLength -= length1;
-            match = base;
+            match = prefixStart;
             if (op > oend_w || sequence.matchLength < MINMATCH) {
               U32 i;
               for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i];
@@ -1354,10 +1412,10 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
     BYTE* op = ostart;
     const BYTE* litPtr = dctx->litPtr;
     const BYTE* const litEnd = litPtr + dctx->litSize;
-    const BYTE* const base = (const BYTE*) (dctx->base);
-    const BYTE* const vBase = (const BYTE*) (dctx->vBase);
+    const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
+    const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
     const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
-    DEBUGLOG(5, "ZSTD_decompressSequences");
+    DEBUGLOG(5, "ZSTD_decompressSequences_body");
 
     /* Regen sequences */
     if (nbSeq) {
@@ -1372,14 +1430,14 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
         for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) {
             nbSeq--;
             {   seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
-                size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd);
+                size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
                 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
                 if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
                 op += oneSeqSize;
         }   }
 
         /* check if reached exact end */
-        DEBUGLOG(5, "ZSTD_decompressSequences: after decode loop, remaining nbSeq : %i", nbSeq);
+        DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
         if (nbSeq) return ERROR(corruption_detected);
         /* save reps for next block */
         { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
@@ -1498,8 +1556,8 @@ ZSTD_decompressSequencesLong_body(
     BYTE* op = ostart;
     const BYTE* litPtr = dctx->litPtr;
     const BYTE* const litEnd = litPtr + dctx->litSize;
-    const BYTE* const prefixStart = (const BYTE*) (dctx->base);
-    const BYTE* const dictStart = (const BYTE*) (dctx->vBase);
+    const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
+    const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
     const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
 
     /* Regen sequences */
@@ -1662,7 +1720,8 @@ static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
     /* isLongOffset must be true if there are long offsets.
      * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
      * We don't expect that to be the case in 64-bit mode.
-     * In block mode, window size is not known, so we have to be conservative. (note: but it could be evaluated from current-lowLimit)
+     * In block mode, window size is not known, so we have to be conservative.
+     * (note: but it could be evaluated from current-lowLimit)
      */
     ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)));
     DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
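
As a worked example of the long-offset comment above (assuming STREAM_ACCUMULATOR_MIN resolves to 25 on 32-bit targets, as defined elsewhere in the library): a 32-bit build decoding a frame whose window is 64 MB has windowSize = 2^26 > 2^25, so isLongOffset is set and the long-offset sequence path is taken; on a 64-bit build MEM_32bits() is 0 and the flag stays clear regardless of window size.
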
@@ -1701,8 +1760,8 @@ static void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
 {
     if (dst != dctx->previousDstEnd) {   /* not contiguous */
         dctx->dictEnd = dctx->previousDstEnd;
-        dctx->vBase = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
-        dctx->base = dst;
+        dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
+        dctx->prefixStart = dst;
         dctx->previousDstEnd = dst;
     }
 }
@@ -1729,10 +1788,10 @@ ZSTDLIB_API size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, siz
 }
 
 
-static size_t ZSTD_generateNxBytes(void* dst, size_t dstCapacity, BYTE byte, size_t length)
+static size_t ZSTD_generateNxBytes(void* dst, size_t dstCapacity, BYTE value, size_t length)
 {
     if (length > dstCapacity) return ERROR(dstSize_tooSmall);
-    memset(dst, byte, length);
+    memset(dst, value, length);
     return length;
 }
 
@@ -1749,7 +1808,7 @@ size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
 #endif
     if ( (srcSize >= ZSTD_skippableHeaderSize)
       && (MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START ) {
-        return ZSTD_skippableHeaderSize + MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize);
+        return ZSTD_skippableHeaderSize + MEM_readLE32((const BYTE*)src + ZSTD_FRAMEIDSIZE);
     } else {
         const BYTE* ip = (const BYTE*)src;
         const BYTE* const ipstart = ip;
@@ -1783,7 +1842,6 @@ size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
         if (zfh.checksumFlag) {   /* Final frame content checksum */
             if (remainingSize < 4) return ERROR(srcSize_wrong);
             ip += 4;
-            remainingSize -= 4;
         }
 
         return ip - ipstart;
@@ -1871,9 +1929,6 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
     return op-ostart;
 }
 
-static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict);
-static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict);
-
 static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
                                         void* dst, size_t dstCapacity,
                                   const void* src, size_t srcSize,
@@ -1881,6 +1936,9 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
                                   const ZSTD_DDict* ddict)
 {
     void* const dststart = dst;
+    int moreThan1Frame = 0;
+
+    DEBUGLOG(5, "ZSTD_decompressMultiFrame");
     assert(dict==NULL || ddict==NULL);  /* either dict or ddict set, not both */
 
     if (ddict) {
@@ -1889,7 +1947,6 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
     }
 
     while (srcSize >= ZSTD_frameHeaderSize_prefix) {
-        U32 magicNumber;
 
 #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
         if (ZSTD_isLegacy(src, srcSize)) {
@@ -1911,24 +1968,21 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
         }
 #endif
 
-        magicNumber = MEM_readLE32(src);
-        DEBUGLOG(4, "reading magic number %08X (expecting %08X)",
-                    (U32)magicNumber, (U32)ZSTD_MAGICNUMBER);
-        if (magicNumber != ZSTD_MAGICNUMBER) {
+        {   U32 const magicNumber = MEM_readLE32(src);
+            DEBUGLOG(4, "reading magic number %08X (expecting %08X)",
+                        (U32)magicNumber, (U32)ZSTD_MAGICNUMBER);
             if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
                 size_t skippableSize;
                 if (srcSize < ZSTD_skippableHeaderSize)
                     return ERROR(srcSize_wrong);
-                skippableSize = MEM_readLE32((const BYTE*)src + ZSTD_frameIdSize)
+                skippableSize = MEM_readLE32((const BYTE*)src + ZSTD_FRAMEIDSIZE)
                               + ZSTD_skippableHeaderSize;
                 if (srcSize < skippableSize) return ERROR(srcSize_wrong);
 
                 src = (const BYTE *)src + skippableSize;
                 srcSize -= skippableSize;
                 continue;
-            }
-            return ERROR(prefix_unknown);
-        }
+        }   }
 
         if (ddict) {
             /* we were called from ZSTD_decompress_usingDDict */
@@ -1942,11 +1996,25 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
 
         {   const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity,
                                                     &src, &srcSize);
+            if ( (ZSTD_getErrorCode(res) == ZSTD_error_prefix_unknown)
+              && (moreThan1Frame==1) ) {
+                /* at least one frame successfully completed,
+                 * but following bytes are garbage :
+                 * it's more likely to be a srcSize error,
+                 * specifying more bytes than compressed size of frame(s).
+                 * This error message replaces ERROR(prefix_unknown),
+                 * which would be confusing, as the first header is actually correct.
+                 * Note that one could be unlucky, it might be a corruption error instead,
+                 * happening right at the place where we expect zstd magic bytes.
+                 * But this is _much_ less likely than a srcSize field error. */
+                return ERROR(srcSize_wrong);
+            }
             if (ZSTD_isError(res)) return res;
             /* no need to bound check, ZSTD_decompressFrame already has */
             dst = (BYTE*)dst + res;
             dstCapacity -= res;
         }
+        moreThan1Frame = 1;
     }  /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */
 
     if (srcSize) return ERROR(srcSize_wrong); /* input not entirely consumed */
@@ -1980,6 +2048,7 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr
     return regenSize;
 #else   /* stack mode */
     ZSTD_DCtx dctx;
+    ZSTD_initDCtx_internal(&dctx);
     return ZSTD_decompressDCtx(&dctx, dst, dstCapacity, src, srcSize);
 #endif
 }
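
A minimal sketch of the behaviour described in the comment above: a valid frame followed by garbage, with srcSize overstated, is now reported as a srcSize error rather than prefix_unknown. It assumes <zstd.h> and libzstd are available; the buffer sizes and the 16 garbage bytes are arbitrary choices for the sketch.

    #include <stdio.h>
    #include <string.h>
    #include <zstd.h>

    int main(void)
    {
        const char text[] = "zstd zstd zstd zstd zstd zstd zstd";
        char frame[256];
        char padded[272];
        char out[256];
        size_t const cSize = ZSTD_compress(frame, sizeof(frame), text, sizeof(text), 1);
        if (ZSTD_isError(cSize)) return 1;
        /* valid frame, then 16 bytes of garbage, with srcSize covering both */
        memcpy(padded, frame, cSize);
        memset(padded + cSize, 0xAA, 16);
        {   size_t const r = ZSTD_decompress(out, sizeof(out), padded, cSize + 16);
            /* expected: an error mapping to srcSize_wrong,
             * since the first frame decoded correctly */
            printf("%s\n", ZSTD_isError(r) ? ZSTD_getErrorName(r) : "ok");
        }
        return 0;
    }
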
@@ -2031,7 +2100,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
     case ZSTDds_getFrameHeaderSize :
         assert(src != NULL);
         if (dctx->format == ZSTD_f_zstd1) {  /* allows header */
-            assert(srcSize >= ZSTD_frameIdSize);  /* to read skippable magic number */
+            assert(srcSize >= ZSTD_FRAMEIDSIZE);  /* to read skippable magic number */
             if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {        /* skippable frame */
                 memcpy(dctx->headerBuffer, src, srcSize);
                 dctx->expected = ZSTD_skippableHeaderSize - srcSize;  /* remaining to load to get full skippable frame header */
@@ -2141,7 +2210,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
         assert(src != NULL);
         assert(srcSize <= ZSTD_skippableHeaderSize);
         memcpy(dctx->headerBuffer + (ZSTD_skippableHeaderSize - srcSize), src, srcSize);   /* complete skippable header */
-        dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_frameIdSize);   /* note : dctx->expected can grow seriously large, beyond local buffer size */
+        dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE);   /* note : dctx->expected can grow seriously large, beyond local buffer size */
         dctx->stage = ZSTDds_skipFrame;
         return 0;
 
@@ -2159,27 +2228,33 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
 static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
 {
     dctx->dictEnd = dctx->previousDstEnd;
-    dctx->vBase = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
-    dctx->base = dict;
+    dctx->virtualStart = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
+    dctx->prefixStart = dict;
     dctx->previousDstEnd = (const char*)dict + dictSize;
     return 0;
 }
 
-/* ZSTD_loadEntropy() :
- * dict : must point at beginning of a valid zstd dictionary
+/*! ZSTD_loadEntropy() :
+ *  dict : must point at beginning of a valid zstd dictionary.
  * @return : size of entropy tables read */
-static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const dict, size_t const dictSize)
+static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy,
+                         const void* const dict, size_t const dictSize)
 {
     const BYTE* dictPtr = (const BYTE*)dict;
     const BYTE* const dictEnd = dictPtr + dictSize;
 
     if (dictSize <= 8) return ERROR(dictionary_corrupted);
+    assert(MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY);   /* dict must be valid */
     dictPtr += 8;   /* skip header = magic + dictID */
 
-
-    {   size_t const hSize = HUF_readDTableX4_wksp(
-            entropy->hufTable, dictPtr, dictEnd - dictPtr,
-            entropy->workspace, sizeof(entropy->workspace));
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, OFTable) == offsetof(ZSTD_entropyDTables_t, LLTable) + sizeof(entropy->LLTable));
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, MLTable) == offsetof(ZSTD_entropyDTables_t, OFTable) + sizeof(entropy->OFTable));
+    ZSTD_STATIC_ASSERT(sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable) >= HUF_DECOMPRESS_WORKSPACE_SIZE);
+    {   void* const workspace = &entropy->LLTable;   /* use fse tables as temporary workspace; implies fse tables are grouped together */
+        size_t const workspaceSize = sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable);
+        size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable,
+                                                dictPtr, dictEnd - dictPtr,
+                                                workspace, workspaceSize);
         if (HUF_isError(hSize)) return ERROR(dictionary_corrupted);
         dictPtr += hSize;
     }
@@ -2190,7 +2265,7 @@ static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const
         if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
         if (offcodeMaxValue > MaxOff) return ERROR(dictionary_corrupted);
         if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
-        ZSTD_buildFSETable(entropy->OFTable,
+        ZSTD_buildFSETable( entropy->OFTable,
                             offcodeNCount, offcodeMaxValue,
                             OF_base, OF_bits,
                             offcodeLog);
@@ -2203,7 +2278,7 @@ static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const
         if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
         if (matchlengthMaxValue > MaxML) return ERROR(dictionary_corrupted);
         if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
-        ZSTD_buildFSETable(entropy->MLTable,
+        ZSTD_buildFSETable( entropy->MLTable,
                             matchlengthNCount, matchlengthMaxValue,
                             ML_base, ML_bits,
                             matchlengthLog);
@@ -2216,7 +2291,7 @@ static size_t ZSTD_loadEntropy(ZSTD_entropyDTables_t* entropy, const void* const
         if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
         if (litlengthMaxValue > MaxLL) return ERROR(dictionary_corrupted);
         if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
-        ZSTD_buildFSETable(entropy->LLTable,
+        ZSTD_buildFSETable( entropy->LLTable,
                             litlengthNCount, litlengthMaxValue,
                             LL_base, LL_bits,
                             litlengthLog);
@@ -2242,7 +2317,7 @@ static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict
         if (magic != ZSTD_MAGIC_DICTIONARY) {
             return ZSTD_refDictContent(dctx, dict, dictSize);   /* pure content mode */
     }   }
-    dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_frameIdSize);
+    dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
 
     /* load entropy tables */
     {   size_t const eSize = ZSTD_loadEntropy(&dctx->entropy, dict, dictSize);
@@ -2256,7 +2331,6 @@ static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict
     return ZSTD_refDictContent(dctx, dict, dictSize);
 }
 
-/* Note : this function cannot fail */
 size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
 {
     assert(dctx != NULL);
@@ -2264,8 +2338,8 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
     dctx->stage = ZSTDds_getFrameHeaderSize;
     dctx->decodedSize = 0;
     dctx->previousDstEnd = NULL;
-    dctx->base = NULL;
-    dctx->vBase = NULL;
+    dctx->prefixStart = NULL;
+    dctx->virtualStart = NULL;
     dctx->dictEnd = NULL;
     dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
     dctx->litEntropy = dctx->fseEntropy = 0;
@@ -2302,42 +2376,53 @@ struct ZSTD_DDict_s {
 
 static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict)
 {
+    assert(ddict != NULL);
     return ddict->dictContent;
 }
 
 static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict)
 {
+    assert(ddict != NULL);
     return ddict->dictSize;
 }
 
-size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dstDCtx, const ZSTD_DDict* ddict)
+size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
 {
-    CHECK_F( ZSTD_decompressBegin(dstDCtx) );
-    if (ddict) {   /* support begin on NULL */
-        dstDCtx->dictID = ddict->dictID;
-        dstDCtx->base = ddict->dictContent;
-        dstDCtx->vBase = ddict->dictContent;
-        dstDCtx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;
-        dstDCtx->previousDstEnd = dstDCtx->dictEnd;
+    DEBUGLOG(4, "ZSTD_decompressBegin_usingDDict");
+    assert(dctx != NULL);
+    if (ddict) {
+        dctx->ddictIsCold = (dctx->dictEnd != (const char*)ddict->dictContent + ddict->dictSize);
+        DEBUGLOG(4, "DDict is %s",
+                    dctx->ddictIsCold ? "~cold~" : "hot!");
+    }
+    CHECK_F( ZSTD_decompressBegin(dctx) );
+    if (ddict) {   /* NULL ddict is equivalent to no dictionary */
+        dctx->dictID = ddict->dictID;
+        dctx->prefixStart = ddict->dictContent;
+        dctx->virtualStart = ddict->dictContent;
+        dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;
+        dctx->previousDstEnd = dctx->dictEnd;
         if (ddict->entropyPresent) {
-            dstDCtx->litEntropy = 1;
-            dstDCtx->fseEntropy = 1;
-            dstDCtx->LLTptr = ddict->entropy.LLTable;
-            dstDCtx->MLTptr = ddict->entropy.MLTable;
-            dstDCtx->OFTptr = ddict->entropy.OFTable;
-            dstDCtx->HUFptr = ddict->entropy.hufTable;
-            dstDCtx->entropy.rep[0] = ddict->entropy.rep[0];
-            dstDCtx->entropy.rep[1] = ddict->entropy.rep[1];
-            dstDCtx->entropy.rep[2] = ddict->entropy.rep[2];
+            dctx->litEntropy = 1;
+            dctx->fseEntropy = 1;
+            dctx->LLTptr = ddict->entropy.LLTable;
+            dctx->MLTptr = ddict->entropy.MLTable;
+            dctx->OFTptr = ddict->entropy.OFTable;
+            dctx->HUFptr = ddict->entropy.hufTable;
+            dctx->entropy.rep[0] = ddict->entropy.rep[0];
+            dctx->entropy.rep[1] = ddict->entropy.rep[1];
+            dctx->entropy.rep[2] = ddict->entropy.rep[2];
         } else {
-            dstDCtx->litEntropy = 0;
-            dstDCtx->fseEntropy = 0;
+            dctx->litEntropy = 0;
+            dctx->fseEntropy = 0;
         }
     }
     return 0;
 }
 
-static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict, ZSTD_dictContentType_e dictContentType)
+static size_t
+ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict,
+                         ZSTD_dictContentType_e dictContentType)
 {
     ddict->dictID = 0;
     ddict->entropyPresent = 0;
@@ -2355,10 +2440,12 @@ static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict, ZSTD_dictContentType_e
             return 0;   /* pure content mode */
         }
     }
-    ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_frameIdSize);
+    ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE);
 
     /* load entropy tables */
-    CHECK_E( ZSTD_loadEntropy(&ddict->entropy, ddict->dictContent, ddict->dictSize), dictionary_corrupted );
+    CHECK_E( ZSTD_loadEntropy(&ddict->entropy,
+                              ddict->dictContent, ddict->dictSize),
+             dictionary_corrupted );
     ddict->entropyPresent = 1;
     return 0;
 }
@@ -2372,6 +2459,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
     if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) {
         ddict->dictBuffer = NULL;
         ddict->dictContent = dict;
+        if (!dict) dictSize = 0;
     } else {
         void* const internalBuffer = ZSTD_malloc(dictSize, ddict->cMem);
         ddict->dictBuffer = internalBuffer;
@@ -2396,14 +2484,15 @@ ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
     if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
 
     {   ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem);
-        if (!ddict) return NULL;
+        if (ddict == NULL) return NULL;
         ddict->cMem = customMem;
-
-        if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, dictLoadMethod, dictContentType) )) {
-            ZSTD_freeDDict(ddict);
-            return NULL;
-        }
-
+        {   size_t const initResult = ZSTD_initDDict_internal(ddict,
+                                            dict, dictSize,
+                                            dictLoadMethod, dictContentType);
+            if (ZSTD_isError(initResult)) {
+                ZSTD_freeDDict(ddict);
+                return NULL;
+        }   }
         return ddict;
     }
 }
@@ -2430,23 +2519,25 @@ ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize
 
 
 const ZSTD_DDict* ZSTD_initStaticDDict(
-                                void* workspace, size_t workspaceSize,
+                                void* sBuffer, size_t sBufferSize,
                                 const void* dict, size_t dictSize,
                                 ZSTD_dictLoadMethod_e dictLoadMethod,
                                 ZSTD_dictContentType_e dictContentType)
 {
-    size_t const neededSpace =
-            sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
-    ZSTD_DDict* const ddict = (ZSTD_DDict*)workspace;
-    assert(workspace != NULL);
+    size_t const neededSpace = sizeof(ZSTD_DDict)
+                             + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
+    ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer;
+    assert(sBuffer != NULL);
     assert(dict != NULL);
-    if ((size_t)workspace & 7) return NULL;  /* 8-aligned */
-    if (workspaceSize < neededSpace) return NULL;
+    if ((size_t)sBuffer & 7) return NULL;   /* 8-aligned */
+    if (sBufferSize < neededSpace) return NULL;
     if (dictLoadMethod == ZSTD_dlm_byCopy) {
         memcpy(ddict+1, dict, dictSize);  /* local copy */
         dict = ddict+1;
     }
-    if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, ZSTD_dlm_byRef, dictContentType) ))
+    if (ZSTD_isError( ZSTD_initDDict_internal(ddict,
+                                              dict, dictSize,
+                                              ZSTD_dlm_byRef, dictContentType) ))
         return NULL;
     return ddict;
 }
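
A hedged sketch of carving out a static DDict per the checks above (8-byte alignment, neededSpace = header plus optional dict copy). It assumes the ZSTD_STATIC_LINKING_ONLY API (ZSTD_estimateDDictSize, ZSTD_initStaticDDict) and that malloc() returns at least 8-byte-aligned memory; makeStaticDDict is a hypothetical helper name.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <stdlib.h>
    #include <zstd.h>

    /* Returns a DDict placed inside *workspaceOut, or NULL on failure. */
    static const ZSTD_DDict* makeStaticDDict(const void* dictBuf, size_t dictSize,
                                             void** workspaceOut)
    {
        size_t const need = ZSTD_estimateDDictSize(dictSize, ZSTD_dlm_byCopy);
        void* const workspace = malloc(need);
        if (workspace == NULL) return NULL;
        *workspaceOut = workspace;   /* caller frees, even if init fails */
        /* returns NULL if the buffer is misaligned or smaller than neededSpace */
        return ZSTD_initStaticDDict(workspace, need,
                                    dictBuf, dictSize,
                                    ZSTD_dlm_byCopy, ZSTD_dct_auto);
    }
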
@@ -2484,7 +2575,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
 {
     if (dictSize < 8) return 0;
     if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) return 0;
-    return MEM_readLE32((const char*)dict + ZSTD_frameIdSize);
+    return MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
 }
 
 /*! ZSTD_getDictID_fromDDict() :
@@ -2560,12 +2651,15 @@ size_t ZSTD_freeDStream(ZSTD_DStream* zds)
 }
 
 
-/* *** Initialization *** */
+/* ***  Initialization  *** */
 
 size_t ZSTD_DStreamInSize(void)  { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; }
 size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; }
 
-size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
+size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx,
+                                   const void* dict, size_t dictSize,
+                                         ZSTD_dictLoadMethod_e dictLoadMethod,
+                                         ZSTD_dictContentType_e dictContentType)
 {
     if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
     ZSTD_freeDDict(dctx->ddictLocal);
@@ -2607,6 +2701,7 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di
 {
     DEBUGLOG(4, "ZSTD_initDStream_usingDict");
     zds->streamStage = zdss_init;
+    zds->noForwardProgress = 0;
     CHECK_F( ZSTD_DCtx_loadDictionary(zds, dict, dictSize) );
     return ZSTD_frameHeaderSize_prefix;
 }
@@ -2618,13 +2713,6 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds)
     return ZSTD_initDStream_usingDict(zds, NULL, 0);
 }
 
-size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
-{
-    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
-    dctx->ddict = ddict;
-    return 0;
-}
-
 /* ZSTD_initDStream_usingDDict() :
  * ddict will just be referenced, and must outlive decompression session
  * this function cannot fail */
@@ -2663,6 +2751,13 @@ size_t ZSTD_setDStreamParameter(ZSTD_DStream* dctx,
     return 0;
 }
 
+size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{
+    if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
+    dctx->ddict = ddict;
+    return 0;
+}
+
 size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize)
 {
     if (dctx->streamStage != zdss_init) return ERROR(stage_wrong);
@@ -2767,7 +2862,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                     return hint;
             }   }
 #endif
-            {   size_t const hSize = ZSTD_getFrameHeader_internal(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format);
+            {   size_t const hSize = ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format);
                 DEBUGLOG(5, "header size : %u", (U32)hSize);
                 if (ZSTD_isError(hSize)) {
 #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
@@ -2828,7 +2923,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
             CHECK_F(ZSTD_decompressBegin_usingDDict(zds, zds->ddict));
 
             if ((MEM_readLE32(zds->headerBuffer) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {  /* skippable frame */
-                zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_frameIdSize);
+                zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE);
                 zds->stage = ZSTDds_skipFrame;
             } else {
                 CHECK_F(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize));
@@ -2947,8 +3042,18 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
     }   }
 
     /* result */
-    input->pos += (size_t)(ip-istart);
-    output->pos += (size_t)(op-ostart);
+    input->pos = (size_t)(ip - (const char*)(input->src));
+    output->pos = (size_t)(op - (char*)(output->dst));
+    if ((ip==istart) && (op==ostart)) {  /* no forward progress */
+        zds->noForwardProgress ++;
+        if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) {
+            if (op==oend) return ERROR(dstSize_tooSmall);
+            if (ip==iend) return ERROR(srcSize_wrong);
+            assert(0);
+        }
+    } else {
+        zds->noForwardProgress = 0;
+    }
     {   size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds);
         if (!nextSrcSizeHint) {   /* frame fully decoded */
             if (zds->outEnd == zds->outStart) {  /* output fully flushed */
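
Below, a self-contained sketch of a streaming loop that always makes forward progress, so the new noForwardProgress counter stays at zero; a caller that kept presenting a full output buffer or an exhausted input would instead hit dstSize_tooSmall or srcSize_wrong after ZSTD_NO_FORWARD_PROGRESS_MAX attempts. It assumes <zstd.h>/libzstd; the tiny 8-byte output chunk only serves to force several calls.

    #include <stdio.h>
    #include <zstd.h>

    int main(void)
    {
        const char text[] = "streaming example streaming example streaming example";
        char frame[256];
        char chunk[8];   /* deliberately tiny output buffer */
        size_t const cSize = ZSTD_compress(frame, sizeof(frame), text, sizeof(text), 1);
        ZSTD_DStream* const ds = ZSTD_createDStream();
        if (ZSTD_isError(cSize) || ds == NULL) return 1;
        ZSTD_initDStream(ds);
        {   ZSTD_inBuffer in = { frame, cSize, 0 };
            size_t ret = 1;
            while (ret != 0) {   /* 0 means frame fully decoded and flushed */
                ZSTD_outBuffer out = { chunk, sizeof(chunk), 0 };
                ret = ZSTD_decompressStream(ds, &out, &in);
                if (ZSTD_isError(ret)) { printf("error: %s\n", ZSTD_getErrorName(ret)); break; }
                /* every call consumes input and/or flushes output => progress */
                fwrite(chunk, 1, out.pos, stdout);
            }
        }
        ZSTD_freeDStream(ds);
        return 0;
    }
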
diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c
index b5a3957a9b96..6b4af69d29c5 100644
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@@ -29,6 +29,7 @@
 #include "mem.h" /* read */
 #include "pool.h"
 #include "threading.h"
+#include "cover.h"
 #include "zstd_internal.h" /* includes zstd.h */
 #ifndef ZDICT_STATIC_LINKING_ONLY
 #define ZDICT_STATIC_LINKING_ONLY
@@ -39,6 +40,7 @@
 *  Constants
 ***************************************/
 #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define DEFAULT_SPLITPOINT 1.0
 
 /*-*************************************
 *  Console display
@@ -184,7 +186,7 @@ static void COVER_map_remove(COVER_map_t *map, U32 key) {
 }
 
 /**
- * Destroyes a map that is inited with COVER_map_init().
+ * Destroys a map that is inited with COVER_map_init().
  */
 static void COVER_map_destroy(COVER_map_t *map) {
   if (map->data) {
@@ -203,6 +205,8 @@ typedef struct {
   size_t *offsets;
   const size_t *samplesSizes;
   size_t nbSamples;
+  size_t nbTrainSamples;
+  size_t nbTestSamples;
   U32 *suffix;
   size_t suffixSize;
   U32 *freqs;
@@ -220,9 +224,9 @@ static COVER_ctx_t *g_ctx = NULL;
 /**
  * Returns the sum of the sample sizes.
  */
-static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
+size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
   size_t sum = 0;
-  size_t i;
+  unsigned i;
   for (i = 0; i < nbSamples; ++i) {
     sum += samplesSizes[i];
   }
@@ -377,14 +381,6 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group,
   ctx->suffix[dmerId] = freq;
 }
 
-/**
- * A segment is a range in the source as well as the score of the segment.
- */
-typedef struct {
-  U32 begin;
-  U32 end;
-  U32 score;
-} COVER_segment_t;
 
 /**
  * Selects the best segment in an epoch.
@@ -494,6 +490,10 @@ static int COVER_checkParameters(ZDICT_cover_params_t parameters,
   if (parameters.d > parameters.k) {
     return 0;
   }
+  /* 0 < splitPoint <= 1 */
+  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){
+    return 0;
+  }
   return 1;
 }
 
@@ -531,9 +531,14 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
  */
 static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                           const size_t *samplesSizes, unsigned nbSamples,
-                          unsigned d) {
+                          unsigned d, double splitPoint) {
   const BYTE *const samples = (const BYTE *)samplesBuffer;
   const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+  /* Split samples into testing and training sets */
+  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+  const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+  const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
   /* Checks */
   if (totalSamplesSize < MAX(d, sizeof(U64)) ||
       totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
@@ -541,15 +546,29 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                  (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
     return 0;
   }
+  /* Check if there are at least 5 training samples */
+  if (nbTrainSamples < 5) {
+    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
+    return 0;
+  }
+  /* Check if there's testing sample */
+  if (nbTestSamples < 1) {
+    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
+    return 0;
+  }
   /* Zero the context */
   memset(ctx, 0, sizeof(*ctx));
-  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples,
-               (U32)totalSamplesSize);
+  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
+               (U32)trainingSamplesSize);
+  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
+               (U32)testSamplesSize);
   ctx->samples = samples;
   ctx->samplesSizes = samplesSizes;
   ctx->nbSamples = nbSamples;
+  ctx->nbTrainSamples = nbTrainSamples;
+  ctx->nbTestSamples = nbTestSamples;
   /* Partial suffix array */
-  ctx->suffixSize = totalSamplesSize - MAX(d, sizeof(U64)) + 1;
+  ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
   ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
   /* Maps index to the dmerID */
   ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
@@ -563,7 +582,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
   ctx->freqs = NULL;
   ctx->d = d;
 
-  /* Fill offsets from the samlesSizes */
+  /* Fill offsets from the samplesSizes */
   {
     U32 i;
     ctx->offsets[0] = 0;
@@ -581,10 +600,17 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
     for (i = 0; i < ctx->suffixSize; ++i) {
       ctx->suffix[i] = i;
     }
-    /* qsort doesn't take an opaque pointer, so pass as a global */
+    /* qsort doesn't take an opaque pointer, so pass as a global.
+     * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
+     */
     g_ctx = ctx;
+#if defined(__OpenBSD__)
+    mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
+          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
+#else
     qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
           (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
+#endif
   }
   DISPLAYLEVEL(2, "Computing frequencies\n");
   /* For each dmer group (group of positions with the same first d bytes):
@@ -613,7 +639,7 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
   /* Divide the data up into epochs of equal size.
    * We will select at least one segment from each epoch.
    */
-  const U32 epochs = (U32)(dictBufferCapacity / parameters.k);
+  const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k / 4));
   const U32 epochSize = (U32)(ctx->suffixSize / epochs);
   size_t epoch;
   DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
@@ -658,7 +684,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
   BYTE* const dict = (BYTE*)dictBuffer;
   COVER_ctx_t ctx;
   COVER_map_t activeDmers;
-
+  parameters.splitPoint = 1.0;
   /* Initialize global data */
   g_displayLevel = parameters.zParams.notificationLevel;
   /* Checks */
@@ -677,7 +703,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
   }
   /* Initialize context and activeDmers */
   if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
-                      parameters.d)) {
+                      parameters.d, parameters.splitPoint)) {
     return ERROR(GENERIC);
   }
   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
@@ -704,28 +730,65 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
   }
 }
 
-/**
- * COVER_best_t is used for two purposes:
- * 1. Synchronizing threads.
- * 2. Saving the best parameters and dictionary.
- *
- * All of the methods except COVER_best_init() are thread safe if zstd is
- * compiled with multithreaded support.
- */
-typedef struct COVER_best_s {
-  ZSTD_pthread_mutex_t mutex;
-  ZSTD_pthread_cond_t cond;
-  size_t liveJobs;
-  void *dict;
-  size_t dictSize;
-  ZDICT_cover_params_t parameters;
-  size_t compressedSize;
-} COVER_best_t;
+
+
+size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+                                    const size_t *samplesSizes, const BYTE *samples,
+                                    size_t *offsets,
+                                    size_t nbTrainSamples, size_t nbSamples,
+                                    BYTE *const dict, size_t dictBufferCapacity) {
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Pointers */
+  ZSTD_CCtx *cctx;
+  ZSTD_CDict *cdict;
+  void *dst;
+  /* Local variables */
+  size_t dstCapacity;
+  size_t i;
+  /* Allocate dst with enough space to compress the maximum sized sample */
+  {
+    size_t maxSampleSize = 0;
+    i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
+    for (; i < nbSamples; ++i) {
+      maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
+    }
+    dstCapacity = ZSTD_compressBound(maxSampleSize);
+    dst = malloc(dstCapacity);
+  }
+  /* Create the cctx and cdict */
+  cctx = ZSTD_createCCtx();
+  cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+                           parameters.zParams.compressionLevel);
+  if (!dst || !cctx || !cdict) {
+    goto _compressCleanup;
+  }
+  /* Compress each sample and sum their sizes (or error) */
+  totalCompressedSize = dictBufferCapacity;
+  i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
+  for (; i < nbSamples; ++i) {
+    const size_t size = ZSTD_compress_usingCDict(
+        cctx, dst, dstCapacity, samples + offsets[i],
+        samplesSizes[i], cdict);
+    if (ZSTD_isError(size)) {
+      totalCompressedSize = ERROR(GENERIC);
+      goto _compressCleanup;
+    }
+    totalCompressedSize += size;
+  }
+_compressCleanup:
+  ZSTD_freeCCtx(cctx);
+  ZSTD_freeCDict(cdict);
+  if (dst) {
+    free(dst);
+  }
+  return totalCompressedSize;
+}
+
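
A hedged worked example of the splitPoint arithmetic used above and in COVER_ctx_init, with hypothetical sample counts (cover keeps DEFAULT_SPLITPOINT = 1.0, i.e. no split, unless the caller sets one):

    #include <stdio.h>

    int main(void)
    {
        unsigned const nbSamples  = 100;    /* hypothetical */
        double   const splitPoint = 0.80;   /* hypothetical, must be in (0,1] */
        unsigned const nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
        unsigned const nbTestSamples  = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
        /* training samples feed suffix-array/dictionary construction;
         * COVER_checkTotalCompressedSize then compresses only samples
         * [nbTrainSamples, nbSamples) to score the candidate dictionary */
        printf("train=%u test=%u\n", nbTrainSamples, nbTestSamples);   /* train=80 test=20 */
        return 0;
    }
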
 
 /**
  * Initialize the `COVER_best_t`.
  */
-static void COVER_best_init(COVER_best_t *best) {
+void COVER_best_init(COVER_best_t *best) {
   if (best==NULL) return; /* compatible with init on NULL */
   (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
   (void)ZSTD_pthread_cond_init(&best->cond, NULL);
@@ -739,7 +802,7 @@ static void COVER_best_init(COVER_best_t *best) {
 /**
  * Wait until liveJobs == 0.
  */
-static void COVER_best_wait(COVER_best_t *best) {
+void COVER_best_wait(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -753,7 +816,7 @@ static void COVER_best_wait(COVER_best_t *best) {
 /**
  * Call COVER_best_wait() and then destroy the COVER_best_t.
  */
-static void COVER_best_destroy(COVER_best_t *best) {
+void COVER_best_destroy(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -769,7 +832,7 @@ static void COVER_best_destroy(COVER_best_t *best) {
  * Called when a thread is about to be launched.
  * Increments liveJobs.
  */
-static void COVER_best_start(COVER_best_t *best) {
+void COVER_best_start(COVER_best_t *best) {
   if (!best) {
     return;
   }
@@ -783,7 +846,7 @@ static void COVER_best_start(COVER_best_t *best) {
  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
  * If this dictionary is the best so far save it and its parameters.
  */
-static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
+void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
                               ZDICT_cover_params_t parameters, void *dict,
                               size_t dictSize) {
   if (!best) {
@@ -814,10 +877,10 @@ static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
       best->parameters = parameters;
       best->compressedSize = compressedSize;
     }
-    ZSTD_pthread_mutex_unlock(&best->mutex);
     if (liveJobs == 0) {
       ZSTD_pthread_cond_broadcast(&best->cond);
     }
+    ZSTD_pthread_mutex_unlock(&best->mutex);
   }
 }
 
@@ -832,7 +895,7 @@ typedef struct COVER_tryParameters_data_s {
 } COVER_tryParameters_data_t;
 
 /**
- * Tries a set of parameters and upates the COVER_best_t with the results.
+ * Tries a set of parameters and updates the COVER_best_t with the results.
  * This function is thread safe if zstd is compiled with multithreaded support.
  * It takes its parameters as an *OWNING* opaque pointer to support threading.
  */
@@ -863,7 +926,7 @@ static void COVER_tryParameters(void *opaque) {
                                               dictBufferCapacity, parameters);
     dictBufferCapacity = ZDICT_finalizeDictionary(
         dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
-        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples,
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples,
         parameters.zParams);
     if (ZDICT_isError(dictBufferCapacity)) {
       DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
@@ -871,49 +934,10 @@ static void COVER_tryParameters(void *opaque) {
     }
   }
   /* Check total compressed size */
-  {
-    /* Pointers */
-    ZSTD_CCtx *cctx;
-    ZSTD_CDict *cdict;
-    void *dst;
-    /* Local variables */
-    size_t dstCapacity;
-    size_t i;
-    /* Allocate dst with enough space to compress the maximum sized sample */
-    {
-      size_t maxSampleSize = 0;
-      for (i = 0; i < ctx->nbSamples; ++i) {
-        maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
-      }
-      dstCapacity = ZSTD_compressBound(maxSampleSize);
-      dst = malloc(dstCapacity);
-    }
-    /* Create the cctx and cdict */
-    cctx = ZSTD_createCCtx();
-    cdict = ZSTD_createCDict(dict, dictBufferCapacity,
-                             parameters.zParams.compressionLevel);
-    if (!dst || !cctx || !cdict) {
-      goto _compressCleanup;
-    }
-    /* Compress each sample and sum their sizes (or error) */
-    totalCompressedSize = dictBufferCapacity;
-    for (i = 0; i < ctx->nbSamples; ++i) {
-      const size_t size = ZSTD_compress_usingCDict(
-          cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
-          ctx->samplesSizes[i], cdict);
-      if (ZSTD_isError(size)) {
-        totalCompressedSize = ERROR(GENERIC);
-        goto _compressCleanup;
-      }
-      totalCompressedSize += size;
-    }
-  _compressCleanup:
-    ZSTD_freeCCtx(cctx);
-    ZSTD_freeCDict(cdict);
-    if (dst) {
-      free(dst);
-    }
-  }
+  totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
+                                                       ctx->samples, ctx->offsets,
+                                                       ctx->nbTrainSamples, ctx->nbSamples,
+                                                       dict, dictBufferCapacity);
 
 _cleanup:
   COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
@@ -934,6 +958,8 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
     ZDICT_cover_params_t *parameters) {
   /* constants */
   const unsigned nbThreads = parameters->nbThreads;
+  const double splitPoint =
+      parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
   const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
   const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
   const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
@@ -951,6 +977,10 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
   POOL_ctx *pool = NULL;
 
   /* Checks */
+  if (splitPoint <= 0 || splitPoint > 1) {
+    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+    return ERROR(GENERIC);
+  }
   if (kMinK < kMaxD || kMaxK < kMinK) {
     LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
     return ERROR(GENERIC);
@@ -981,7 +1011,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
     /* Initialize the context for this value of d */
     COVER_ctx_t ctx;
     LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
-    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) {
+    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint)) {
       LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
       COVER_best_destroy(&best);
       POOL_free(pool);
@@ -1006,6 +1036,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
       data->parameters = *parameters;
       data->parameters.k = k;
       data->parameters.d = d;
+      data->parameters.splitPoint = splitPoint;
       data->parameters.steps = kSteps;
       data->parameters.zParams.notificationLevel = g_displayLevel;
       /* Check the parameters */
diff --git a/lib/dictBuilder/cover.h b/lib/dictBuilder/cover.h
new file mode 100644
index 000000000000..82e2e1cea43c
--- /dev/null
+++ b/lib/dictBuilder/cover.h
@@ -0,0 +1,83 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+/**
+ * COVER_best_t is used for two purposes:
+ * 1. Synchronizing threads.
+ * 2. Saving the best parameters and dictionary.
+ *
+ * All of the methods except COVER_best_init() are thread safe if zstd is
+ * compiled with multithreaded support.
+ */
+typedef struct COVER_best_s {
+  ZSTD_pthread_mutex_t mutex;
+  ZSTD_pthread_cond_t cond;
+  size_t liveJobs;
+  void *dict;
+  size_t dictSize;
+  ZDICT_cover_params_t parameters;
+  size_t compressedSize;
+} COVER_best_t;
+
+/**
+ * A segment is a range in the source as well as the score of the segment.
+ */
+typedef struct {
+  U32 begin;
+  U32 end;
+  U32 score;
+} COVER_segment_t;
+
+/**
+ *  Checks total compressed size of a dictionary
+ */
+size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
+                                      const size_t *samplesSizes, const BYTE *samples,
+                                      size_t *offsets,
+                                      size_t nbTrainSamples, size_t nbSamples,
+                                      BYTE *const dict, size_t dictBufferCapacity);
+
+/**
+ * Returns the sum of the sample sizes.
+ */
+size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
+
+/**
+ * Initialize the `COVER_best_t`.
+ */
+void COVER_best_init(COVER_best_t *best);
+
+/**
+ * Wait until liveJobs == 0.
+ */
+void COVER_best_wait(COVER_best_t *best);
+
+/**
+ * Call COVER_best_wait() and then destroy the COVER_best_t.
+ */
+void COVER_best_destroy(COVER_best_t *best);
+
+/**
+ * Called when a thread is about to be launched.
+ * Increments liveJobs.
+ */
+void COVER_best_start(COVER_best_t *best);
+
+/**
+ * Called when a thread finishes executing, both on error or success.
+ * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
+ * If this dictionary is the best so far save it and its parameters.
+ */
+void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
+                       ZDICT_cover_params_t parameters, void *dict,
+                       size_t dictSize);
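
A hedged, single-threaded sketch of the COVER_best_t lifecycle declared above (real callers run COVER_tryParameters jobs on a POOL); the candidate dictionary, its size, and the compressed-size value are placeholders, not taken from the patch.

    #include <string.h>
    #include "cover.h"

    static void tryOneCandidate(COVER_best_t* best)
    {
        char dict[4] = { 0 };                /* placeholder candidate */
        ZDICT_cover_params_t params;
        memset(&params, 0, sizeof(params));
        COVER_best_start(best);              /* liveJobs++ */
        /* keeps the candidate if its compressed size beats the current best,
         * then decrements liveJobs and signals waiters */
        COVER_best_finish(best, 1000 /* compressedSize */, params, dict, sizeof(dict));
    }

    void runSearch(void)
    {
        COVER_best_t best;
        COVER_best_init(&best);
        tryOneCandidate(&best);
        COVER_best_wait(&best);      /* blocks until liveJobs == 0 */
        COVER_best_destroy(&best);   /* also frees the saved best dictionary */
    }
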
diff --git a/lib/dictBuilder/divsufsort.c b/lib/dictBuilder/divsufsort.c
index 60cceb088321..ead9220442b2 100644
--- a/lib/dictBuilder/divsufsort.c
+++ b/lib/dictBuilder/divsufsort.c
@@ -1637,7 +1637,7 @@ construct_SA(const unsigned char *T, int *SA,
             if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
             k = SA + BUCKET_B(c2 = c0, c1);
           }
-          assert(k < j);
+          assert(k < j); assert(k != NULL);
           *k-- = s;
         } else {
           assert(((s == 0) && (T[s] == c1)) || (s < 0));
@@ -1701,7 +1701,7 @@ construct_BWT(const unsigned char *T, int *SA,
             if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
             k = SA + BUCKET_B(c2 = c0, c1);
           }
-          assert(k < j);
+          assert(k < j); assert(k != NULL);
           *k-- = s;
         } else if(s != 0) {
           *j = ~s;
@@ -1785,7 +1785,7 @@ construct_BWT_indexes(const unsigned char *T, int *SA,
             if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
             k = SA + BUCKET_B(c2 = c0, c1);
           }
-          assert(k < j);
+          assert(k < j); assert(k != NULL);
           *k-- = s;
         } else if(s != 0) {
           *j = ~s;
diff --git a/lib/dictBuilder/fastcover.c b/lib/dictBuilder/fastcover.c
new file mode 100644
index 000000000000..dfee45743413
--- /dev/null
+++ b/lib/dictBuilder/fastcover.c
@@ -0,0 +1,728 @@
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+
+#include "mem.h" /* read */
+#include "pool.h"
+#include "threading.h"
+#include "cover.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define FASTCOVER_MAX_F 31
+#define FASTCOVER_MAX_ACCEL 10
+#define DEFAULT_SPLITPOINT 0.75
+#define DEFAULT_F 20
+#define DEFAULT_ACCEL 1
+
+
+/*-*************************************
+*  Console display
+***************************************/
+static int g_displayLevel = 2;
+#define DISPLAY(...)                                                           \
+  {                                                                            \
+    fprintf(stderr, __VA_ARGS__);                                              \
+    fflush(stderr);                                                            \
+  }
+#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
+  if (displayLevel >= l) {                                                     \
+    DISPLAY(__VA_ARGS__);                                                      \
+  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
+#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
+
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  if (displayLevel >= l) {                                                     \
+    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  }
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
+static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
+static clock_t g_time = 0;
+
+
+/*-*************************************
+* Hash Functions
+***************************************/
+static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+
+
+/**
+ * Hash the d-byte value pointed to by p and reduce it mod 2^f
+ */
+static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 h, unsigned d) {
+  if (d == 6) {
+    return ZSTD_hash6Ptr(p, h) & ((1 << h) - 1);
+  }
+  return ZSTD_hash8Ptr(p, h) & ((1 << h) - 1);
+}
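
As a hedged back-of-envelope check on the 2^f index space this hash produces (the frequency table allocated later in this file uses one U32 counter per bucket, and the per-segment table one U16), for the default f = 20:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        unsigned const f = 20;                       /* DEFAULT_F above */
        uint64_t const buckets = (uint64_t)1 << f;   /* 1,048,576 hash buckets */
        printf("freqs        : %llu KiB\n",
               (unsigned long long)((buckets * sizeof(uint32_t)) >> 10));  /* 4096 KiB */
        printf("segmentFreqs : %llu KiB\n",
               (unsigned long long)((buckets * sizeof(uint16_t)) >> 10));  /* 2048 KiB */
        return 0;
    }
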
+
+
+/*-*************************************
+* Acceleration
+***************************************/
+typedef struct {
+  unsigned finalize;    /* Percentage of training samples used for ZDICT_finalizeDictionary */
+  unsigned skip;        /* Number of dmer skipped between each dmer counted in computeFrequency */
+} FASTCOVER_accel_t;
+
+
+static const FASTCOVER_accel_t FASTCOVER_defaultAccelParameters[FASTCOVER_MAX_ACCEL+1] = {
+  { 100, 0 },   /* accel = 0, should not happen because accel = 0 defaults to accel = 1 */
+  { 100, 0 },   /* accel = 1 */
+  { 50, 1 },   /* accel = 2 */
+  { 34, 2 },   /* accel = 3 */
+  { 25, 3 },   /* accel = 4 */
+  { 20, 4 },   /* accel = 5 */
+  { 17, 5 },   /* accel = 6 */
+  { 14, 6 },   /* accel = 7 */
+  { 13, 7 },   /* accel = 8 */
+  { 11, 8 },   /* accel = 9 */
+  { 10, 9 },   /* accel = 10 */
+};
+
+
+/*-*************************************
+* Context
+***************************************/
+typedef struct {
+  const BYTE *samples;
+  size_t *offsets;
+  const size_t *samplesSizes;
+  size_t nbSamples;
+  size_t nbTrainSamples;
+  size_t nbTestSamples;
+  size_t nbDmers;
+  U32 *freqs;
+  unsigned d;
+  unsigned f;
+  FASTCOVER_accel_t accelParams;
+} FASTCOVER_ctx_t;
+
+
+/*-*************************************
+*  Helper functions
+***************************************/
+/**
+ * Selects the best segment in an epoch.
+ * Segments are scored according to the function:
+ *
+ * Let F(d) be the frequency of all dmers with hash value d.
+ * Let S_i be the hash value of the dmer at position i of segment S, which has length k.
+ *
+ *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
+ *
+ * Once the dmer with hash value d is in the dictionary we set F(d) = 0.
+ */
+static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
+                                              U32 *freqs, U32 begin, U32 end,
+                                              ZDICT_cover_params_t parameters,
+                                              U16* segmentFreqs) {
+  /* Constants */
+  const U32 k = parameters.k;
+  const U32 d = parameters.d;
+  const U32 f = ctx->f;
+  const U32 dmersInK = k - d + 1;
+
+  /* Try each segment (activeSegment) and save the best (bestSegment) */
+  COVER_segment_t bestSegment = {0, 0, 0};
+  COVER_segment_t activeSegment;
+
+  /* Reset the activeDmers in the segment */
+  /* The activeSegment starts at the beginning of the epoch. */
+  activeSegment.begin = begin;
+  activeSegment.end = begin;
+  activeSegment.score = 0;
+
+  /* Slide the activeSegment through the whole epoch.
+   * Save the best segment in bestSegment.
+   */
+  while (activeSegment.end < end) {
+    /* Get hash value of current dmer */
+    const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
+
+    /* Add frequency of this index to score if this is the first occurrence of index in active segment */
+    if (segmentFreqs[index] == 0) {
+      activeSegment.score += freqs[index];
+    }
+    /* Increment end of segment and segmentFreqs */
+    activeSegment.end += 1;
+    segmentFreqs[index] += 1;
+    /* If the window is now too large, drop the first position */
+    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
+      /* Get hash value of the dmer to be eliminated from active segment */
+      const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
+      segmentFreqs[delIndex] -= 1;
+      /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */
+      if (segmentFreqs[delIndex] == 0) {
+        activeSegment.score -= freqs[delIndex];
+      }
+      /* Increment start of segment */
+      activeSegment.begin += 1;
+    }
+
+    /* If this segment is the best so far save it */
+    if (activeSegment.score > bestSegment.score) {
+      bestSegment = activeSegment;
+    }
+  }
+
+  /* Zero out rest of segmentFreqs array */
+  while (activeSegment.begin < end) {
+    const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
+    segmentFreqs[delIndex] -= 1;
+    activeSegment.begin += 1;
+  }
+
+  {
+    /*  Zero the frequency of hash value of each dmer covered by the chosen segment. */
+    U32 pos;
+    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
+      const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, f, d);
+      freqs[i] = 0;
+    }
+  }
+
+  return bestSegment;
+}
+
+
+static int FASTCOVER_checkParameters(ZDICT_cover_params_t parameters,
+                                     size_t maxDictSize, unsigned f,
+                                     unsigned accel) {
+  /* k, d, and f are required parameters */
+  if (parameters.d == 0 || parameters.k == 0) {
+    return 0;
+  }
+  /* d has to be 6 or 8 */
+  if (parameters.d != 6 && parameters.d != 8) {
+    return 0;
+  }
+  /* k <= maxDictSize */
+  if (parameters.k > maxDictSize) {
+    return 0;
+  }
+  /* d <= k */
+  if (parameters.d > parameters.k) {
+    return 0;
+  }
+  /* 0 < f <= FASTCOVER_MAX_F*/
+  if (f > FASTCOVER_MAX_F || f == 0) {
+    return 0;
+  }
+  /* 0 < splitPoint <= 1 */
+  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
+    return 0;
+  }
+  /* 0 < accel <= 10 */
+  if (accel > 10 || accel == 0) {
+    return 0;
+  }
+  return 1;
+}
+
+
+/**
+ * Clean up a context initialized with `FASTCOVER_ctx_init()`.
+ */
+static void
+FASTCOVER_ctx_destroy(FASTCOVER_ctx_t* ctx)
+{
+    if (!ctx) return;
+
+    free(ctx->freqs);
+    ctx->freqs = NULL;
+
+    free(ctx->offsets);
+    ctx->offsets = NULL;
+}
+
+
+/**
+ * Calculate the frequency of the hash value of each dmer in ctx->samples
+ */
+static void
+FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx)
+{
+    const unsigned f = ctx->f;
+    const unsigned d = ctx->d;
+    const unsigned skip = ctx->accelParams.skip;
+    const unsigned readLength = MAX(d, 8);
+    size_t i;
+    assert(ctx->nbTrainSamples >= 5);
+    assert(ctx->nbTrainSamples <= ctx->nbSamples);
+    for (i = 0; i < ctx->nbTrainSamples; i++) {
+        size_t start = ctx->offsets[i];  /* start of current dmer */
+        size_t const currSampleEnd = ctx->offsets[i+1];
+        while (start + readLength <= currSampleEnd) {
+            const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, d);
+            freqs[dmerIndex]++;
+            start = start + skip + 1;
+        }
+    }
+}
+
+
+/**
+ * Prepare a context for dictionary building.
+ * The context is only dependent on the parameter `d` and can be used multiple
+ * times.
+ * Returns 1 on success or zero on error.
+ * The context must be destroyed with `FASTCOVER_ctx_destroy()`.
+ */
+static int
+FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
+                   const void* samplesBuffer,
+                   const size_t* samplesSizes, unsigned nbSamples,
+                   unsigned d, double splitPoint, unsigned f,
+                   FASTCOVER_accel_t accelParams)
+{
+    const BYTE* const samples = (const BYTE*)samplesBuffer;
+    const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+    /* Split samples into testing and training sets */
+    const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+    const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+    const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+    const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
+
+    /* Checks */
+    if (totalSamplesSize < MAX(d, sizeof(U64)) ||
+        totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
+        DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                    (U32)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
+        return 0;
+    }
+
+    /* Check if there are at least 5 training samples */
+    if (nbTrainSamples < 5) {
+        DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
+        return 0;
+    }
+
+    /* Check if there's testing sample */
+    if (nbTestSamples < 1) {
+        DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
+        return 0;
+    }
+
+    /* Zero the context */
+    memset(ctx, 0, sizeof(*ctx));
+    DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
+                    (U32)trainingSamplesSize);
+    DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
+                    (U32)testSamplesSize);
+
+    ctx->samples = samples;
+    ctx->samplesSizes = samplesSizes;
+    ctx->nbSamples = nbSamples;
+    ctx->nbTrainSamples = nbTrainSamples;
+    ctx->nbTestSamples = nbTestSamples;
+    ctx->nbDmers = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
+    ctx->d = d;
+    ctx->f = f;
+    ctx->accelParams = accelParams;
+
+    /* The offsets of each file */
+    ctx->offsets = (size_t*)calloc((nbSamples + 1), sizeof(size_t));
+    if (ctx->offsets == NULL) {
+        DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n");
+        FASTCOVER_ctx_destroy(ctx);
+        return 0;
+    }
+
+    /* Fill offsets from the samplesSizes */
+    {   U32 i;
+        ctx->offsets[0] = 0;
+        assert(nbSamples >= 5);
+        for (i = 1; i <= nbSamples; ++i) {
+            ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+        }
+    }
+
+    /* Initialize frequency array of size 2^f */
+    ctx->freqs = (U32*)calloc(((U64)1 << f), sizeof(U32));
+    if (ctx->freqs == NULL) {
+        DISPLAYLEVEL(1, "Failed to allocate frequency table \n");
+        FASTCOVER_ctx_destroy(ctx);
+        return 0;
+    }
+
+    DISPLAYLEVEL(2, "Computing frequencies\n");
+    FASTCOVER_computeFrequency(ctx->freqs, ctx);
+
+    return 1;
+}
+
+
+/**
+ * Given the prepared context build the dictionary.
+ */
+static size_t
+FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
+                          U32* freqs,
+                          void* dictBuffer, size_t dictBufferCapacity,
+                          ZDICT_cover_params_t parameters,
+                          U16* segmentFreqs)
+{
+  BYTE *const dict = (BYTE *)dictBuffer;
+  size_t tail = dictBufferCapacity;
+  /* Divide the data up into epochs of equal size.
+   * We will select at least one segment from each epoch.
+   */
+  const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k));
+  const U32 epochSize = (U32)(ctx->nbDmers / epochs);
+  size_t epoch;
+  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
+               epochSize);
+  /* Loop through the epochs until there are no more segments or the dictionary
+   * is full.
+   */
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
+    const U32 epochBegin = (U32)(epoch * epochSize);
+    const U32 epochEnd = epochBegin + epochSize;
+    size_t segmentSize;
+    /* Select a segment */
+    COVER_segment_t segment = FASTCOVER_selectSegment(
+        ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
+
+    /* If the segment covers no dmers, then we are out of content */
+    if (segment.score == 0) {
+      break;
+    }
+
+    /* Trim the segment if necessary; if it is too small then we are done */
+    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
+    if (segmentSize < parameters.d) {
+      break;
+    }
+
+    /* We fill the dictionary from the back to allow the best segments to be
+     * referenced with the smallest offsets.
+     */
+    tail -= segmentSize;
+    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+    DISPLAYUPDATE(
+        2, "\r%u%%       ",
+        (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+  }
+  DISPLAYLEVEL(2, "\r%79s\r", "");
+  return tail;
+}
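+
+/* Example (hypothetical numbers): with dictBufferCapacity = 112640 (110 KB)
+ * and parameters.k = 1024, the content is split into
+ * epochs = MAX(1, 112640 / 1024) = 110 epochs; if ctx->nbDmers = 1100000,
+ * each epoch covers epochSize = 10000 dmers, and segments are selected from
+ * the epochs in round-robin order, filling the dictionary from the back. */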
+
+
+/**
+ * Parameters for FASTCOVER_tryParameters().
+ */
+typedef struct FASTCOVER_tryParameters_data_s {
+    const FASTCOVER_ctx_t* ctx;
+    COVER_best_t* best;
+    size_t dictBufferCapacity;
+    ZDICT_cover_params_t parameters;
+} FASTCOVER_tryParameters_data_t;
+
+
+/**
+ * Tries a set of parameters and updates the COVER_best_t with the results.
+ * This function is thread safe if zstd is compiled with multithreaded support.
+ * It takes its parameters as an *OWNING* opaque pointer to support threading.
+ */
+static void FASTCOVER_tryParameters(void *opaque)
+{
+  /* Save parameters as local variables */
+  FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque;
+  const FASTCOVER_ctx_t *const ctx = data->ctx;
+  const ZDICT_cover_params_t parameters = data->parameters;
+  size_t dictBufferCapacity = data->dictBufferCapacity;
+  size_t totalCompressedSize = ERROR(GENERIC);
+  /* Initialize array to keep track of the frequency of each dmer within the active segment */
+  U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
+  /* Allocate space for hash table, dict, and freqs */
+  BYTE *const dict = (BYTE*)malloc(dictBufferCapacity);
+  U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
+  if (!segmentFreqs || !dict || !freqs) {
+    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
+    goto _cleanup;
+  }
+  /* Copy the frequencies because we need to modify them */
+  memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
+  /* Build the dictionary */
+  { const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
+                                                  parameters, segmentFreqs);
+    const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
+    dictBufferCapacity = ZDICT_finalizeDictionary(
+        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+        ctx->samples, ctx->samplesSizes, nbFinalizeSamples, parameters.zParams);
+    if (ZDICT_isError(dictBufferCapacity)) {
+      DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
+      goto _cleanup;
+    }
+  }
+  /* Check total compressed size */
+  totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
+                                                       ctx->samples, ctx->offsets,
+                                                       ctx->nbTrainSamples, ctx->nbSamples,
+                                                       dict, dictBufferCapacity);
+_cleanup:
+  COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
+                    dictBufferCapacity);
+  free(data);
+  free(segmentFreqs);
+  free(dict);
+  free(freqs);
+}
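+
+/* Ownership pattern (see the call sites in
+ * ZDICT_optimizeTrainFromBuffer_fastCover below): the caller allocates the
+ * argument struct, hands it over, and never touches it again, so the same
+ * call works with or without a thread pool. A minimal sketch:
+ *
+ *   FASTCOVER_tryParameters_data_t* data = malloc(sizeof(*data));
+ *   // fill data->ctx, data->best, data->dictBufferCapacity, data->parameters
+ *   if (pool) POOL_add(pool, &FASTCOVER_tryParameters, data);  // worker frees data
+ *   else FASTCOVER_tryParameters(data);                        // frees data before returning
+ */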
+
+
+static void
+FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
+                               ZDICT_cover_params_t* coverParams)
+{
+    coverParams->k = fastCoverParams.k;
+    coverParams->d = fastCoverParams.d;
+    coverParams->steps = fastCoverParams.steps;
+    coverParams->nbThreads = fastCoverParams.nbThreads;
+    coverParams->splitPoint = fastCoverParams.splitPoint;
+    coverParams->zParams = fastCoverParams.zParams;
+}
+
+
+static void
+FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
+                                   ZDICT_fastCover_params_t* fastCoverParams,
+                                   unsigned f, unsigned accel)
+{
+    fastCoverParams->k = coverParams.k;
+    fastCoverParams->d = coverParams.d;
+    fastCoverParams->steps = coverParams.steps;
+    fastCoverParams->nbThreads = coverParams.nbThreads;
+    fastCoverParams->splitPoint = coverParams.splitPoint;
+    fastCoverParams->f = f;
+    fastCoverParams->accel = accel;
+    fastCoverParams->zParams = coverParams.zParams;
+}
+
+
+ZDICTLIB_API size_t
+ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
+                                const void* samplesBuffer,
+                                const size_t* samplesSizes, unsigned nbSamples,
+                                ZDICT_fastCover_params_t parameters)
+{
+    BYTE* const dict = (BYTE*)dictBuffer;
+    FASTCOVER_ctx_t ctx;
+    ZDICT_cover_params_t coverParams;
+    FASTCOVER_accel_t accelParams;
+    /* Initialize global data */
+    g_displayLevel = parameters.zParams.notificationLevel;
+    /* Train on all samples (splitPoint fixed at 1.0); assign default f and accel if not provided */
+    parameters.splitPoint = 1.0;
+    parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f;
+    parameters.accel = parameters.accel == 0 ? DEFAULT_ACCEL : parameters.accel;
+    /* Convert to cover parameters */
+    memset(&coverParams, 0 , sizeof(coverParams));
+    FASTCOVER_convertToCoverParams(parameters, &coverParams);
+    /* Checks */
+    if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
+                                   parameters.accel)) {
+      DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+      return ERROR(GENERIC);
+    }
+    if (nbSamples == 0) {
+      DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
+      return ERROR(GENERIC);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                   ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    /* Assign corresponding FASTCOVER_accel_t to accelParams */
+    accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
+    /* Initialize context */
+    if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
+                            coverParams.d, parameters.splitPoint, parameters.f,
+                            accelParams)) {
+      DISPLAYLEVEL(1, "Failed to initialize context\n");
+      return ERROR(GENERIC);
+    }
+    /* Build the dictionary */
+    DISPLAYLEVEL(2, "Building dictionary\n");
+    {
+      /* Initialize array to keep track of the frequency of each dmer within the active segment */
+      U16* segmentFreqs = (U16 *)calloc(((U64)1 << parameters.f), sizeof(U16));
+      const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer,
+                                                dictBufferCapacity, coverParams, segmentFreqs);
+      const unsigned nbFinalizeSamples = (unsigned)(ctx.nbTrainSamples * ctx.accelParams.finalize / 100);
+      const size_t dictionarySize = ZDICT_finalizeDictionary(
+          dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+          samplesBuffer, samplesSizes, nbFinalizeSamples, coverParams.zParams);
+      if (!ZSTD_isError(dictionarySize)) {
+          DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
+                      (U32)dictionarySize);
+      }
+      FASTCOVER_ctx_destroy(&ctx);
+      free(segmentFreqs);
+      return dictionarySize;
+    }
+}
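+
+/* Minimal usage sketch (illustrative; the sample buffers are assumed to be
+ * prepared by the caller). d and k must be set explicitly for this entry
+ * point, while f and accel fall back to DEFAULT_F and DEFAULT_ACCEL:
+ *
+ *   ZDICT_fastCover_params_t params;
+ *   memset(&params, 0, sizeof(params));
+ *   params.d = 8;
+ *   params.k = 200;
+ *   {   size_t const dictSize = ZDICT_trainFromBuffer_fastCover(
+ *               dictBuffer, dictBufferCapacity,
+ *               samplesBuffer, samplesSizes, nbSamples, params);
+ *       if (ZDICT_isError(dictSize)) { abort(); }  // or propagate the error
+ *       // dictBuffer[0..dictSize) now holds the finalized dictionary
+ *   }
+ */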
+
+
+ZDICTLIB_API size_t
+ZDICT_optimizeTrainFromBuffer_fastCover(
+                    void* dictBuffer, size_t dictBufferCapacity,
+                    const void* samplesBuffer,
+                    const size_t* samplesSizes, unsigned nbSamples,
+                    ZDICT_fastCover_params_t* parameters)
+{
+    ZDICT_cover_params_t coverParams;
+    FASTCOVER_accel_t accelParams;
+    /* constants */
+    const unsigned nbThreads = parameters->nbThreads;
+    const double splitPoint =
+        parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
+    const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
+    const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
+    const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
+    const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
+    const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
+    const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
+    const unsigned kIterations =
+        (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
+    const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
+    const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
+    /* Local variables */
+    const int displayLevel = parameters->zParams.notificationLevel;
+    unsigned iteration = 1;
+    unsigned d;
+    unsigned k;
+    COVER_best_t best;
+    POOL_ctx *pool = NULL;
+    /* Checks */
+    if (splitPoint <= 0 || splitPoint > 1) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
+      return ERROR(GENERIC);
+    }
+    if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
+      return ERROR(GENERIC);
+    }
+    if (kMinK < kMaxD || kMaxK < kMinK) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
+      return ERROR(GENERIC);
+    }
+    if (nbSamples == 0) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
+      return ERROR(GENERIC);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
+                   ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    if (nbThreads > 1) {
+      pool = POOL_create(nbThreads, 1);
+      if (!pool) {
+        return ERROR(memory_allocation);
+      }
+    }
+    /* Initialization */
+    COVER_best_init(&best);
+    memset(&coverParams, 0 , sizeof(coverParams));
+    FASTCOVER_convertToCoverParams(*parameters, &coverParams);
+    accelParams = FASTCOVER_defaultAccelParameters[accel];
+    /* Turn down global display level to clean up display at level 2 and below */
+    g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
+    /* Loop through d first because each new value needs a new context */
+    LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
+                      kIterations);
+    for (d = kMinD; d <= kMaxD; d += 2) {
+      /* Initialize the context for this value of d */
+      FASTCOVER_ctx_t ctx;
+      LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
+      if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams)) {
+        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
+        COVER_best_destroy(&best);
+        POOL_free(pool);
+        return ERROR(GENERIC);
+      }
+      /* Loop through k reusing the same context */
+      for (k = kMinK; k <= kMaxK; k += kStepSize) {
+        /* Prepare the arguments */
+        FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc(
+            sizeof(FASTCOVER_tryParameters_data_t));
+        LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
+        if (!data) {
+          LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
+          COVER_best_destroy(&best);
+          FASTCOVER_ctx_destroy(&ctx);
+          POOL_free(pool);
+          return ERROR(GENERIC);
+        }
+        data->ctx = &ctx;
+        data->best = &best;
+        data->dictBufferCapacity = dictBufferCapacity;
+        data->parameters = coverParams;
+        data->parameters.k = k;
+        data->parameters.d = d;
+        data->parameters.splitPoint = splitPoint;
+        data->parameters.steps = kSteps;
+        data->parameters.zParams.notificationLevel = g_displayLevel;
+        /* Check the parameters */
+        if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
+                                       data->ctx->f, accel)) {
+          DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+          free(data);
+          continue;
+        }
+        /* Call the function and pass ownership of data to it */
+        COVER_best_start(&best);
+        if (pool) {
+          POOL_add(pool, &FASTCOVER_tryParameters, data);
+        } else {
+          FASTCOVER_tryParameters(data);
+        }
+        /* Print status */
+        LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
+                           (U32)((iteration * 100) / kIterations));
+        ++iteration;
+      }
+      COVER_best_wait(&best);
+      FASTCOVER_ctx_destroy(&ctx);
+    }
+    LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
+    /* Fill the output buffer and parameters with the output of the best parameter set */
+    {
+      const size_t dictSize = best.dictSize;
+      if (ZSTD_isError(best.compressedSize)) {
+        const size_t compressedSize = best.compressedSize;
+        COVER_best_destroy(&best);
+        POOL_free(pool);
+        return compressedSize;
+      }
+      FASTCOVER_convertToFastCoverParams(best.parameters, parameters, f, accel);
+      memcpy(dictBuffer, best.dict, dictSize);
+      COVER_best_destroy(&best);
+      POOL_free(pool);
+      return dictSize;
+    }
+
+}
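+
+/* Minimal usage sketch for the optimizing entry point (illustrative): leaving
+ * d, k, steps, f and accel at 0 makes the function search the default grid
+ * (d in {6, 8}, k in [50, 2000]) and write the winning values back into
+ * `parameters`:
+ *
+ *   ZDICT_fastCover_params_t params;
+ *   memset(&params, 0, sizeof(params));
+ *   params.nbThreads = 4;   // optional: parallelize the parameter search
+ *   {   size_t const dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(
+ *               dictBuffer, dictBufferCapacity,
+ *               samplesBuffer, samplesSizes, nbSamples, &params);
+ *       if (!ZDICT_isError(dictSize)) {
+ *           // params.k and params.d now hold the best parameters found
+ *       }
+ *   }
+ */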
diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c
index 7d24e4991812..2964b69fff04 100644
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@@ -293,7 +293,7 @@ static dictItem ZDICT_analyzePos(
             refinedEnd = refinedStart + selectedCount;
         }
 
-        /* evaluate gain based on new ref */
+        /* evaluate gain based on new dict */
         start = refinedStart;
         pos = suffix[refinedStart];
         end = start;
@@ -341,7 +341,7 @@ static dictItem ZDICT_analyzePos(
         for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
             savings[i] = savings[i-1] + (lengthList[i] * (i-3));
 
-        DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f)  \n",
+        DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f)  \n",
                      (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
 
         solution.pos = (U32)pos;
@@ -581,7 +581,7 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
 
 typedef struct
 {
-    ZSTD_CCtx* ref;    /* contains reference to dictionary */
+    ZSTD_CDict* dict;    /* dictionary */
     ZSTD_CCtx* zc;     /* working context */
     void* workPlace;   /* must be ZSTD_BLOCKSIZE_MAX allocated */
 } EStats_ress_t;
@@ -597,8 +597,9 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
     size_t cSize;
 
     if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */
-    {   size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
-        if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
+    {   size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);
+        if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
+
     }
     cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
     if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
@@ -697,7 +698,7 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
     short litLengthNCount[MaxLL+1];
     U32 repOffset[MAXREPOFFSET];
     offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];
-    EStats_ress_t esr;
+    EStats_ress_t esr = { NULL, NULL, NULL };
     ZSTD_parameters params;
     U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
     size_t pos = 0, errorCode;
@@ -708,14 +709,6 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
 
     /* init */
     DEBUGLOG(4, "ZDICT_analyzeEntropy");
-    esr.ref = ZSTD_createCCtx();
-    esr.zc = ZSTD_createCCtx();
-    esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
-    if (!esr.ref || !esr.zc || !esr.workPlace) {
-        eSize = ERROR(memory_allocation);
-        DISPLAYLEVEL(1, "Not enough memory \n");
-        goto _cleanup;
-    }
     if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; }   /* too large dictionary */
     for (u=0; u<256; u++) countLit[u] = 1;   /* any character must be described */
     for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
@@ -724,14 +717,17 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
     memset(repOffset, 0, sizeof(repOffset));
     repOffset[1] = repOffset[4] = repOffset[8] = 1;
     memset(bestRepOffset, 0, sizeof(bestRepOffset));
-    if (compressionLevel<=0) compressionLevel = g_compressionLevel_default;
+    if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
     params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
-    {   size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
-        if (ZSTD_isError(beginResult)) {
-            DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
-            eSize = ERROR(GENERIC);
-            goto _cleanup;
-    }   }
+
+    esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);
+    esr.zc = ZSTD_createCCtx();
+    esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
+    if (!esr.dict || !esr.zc || !esr.workPlace) {
+        eSize = ERROR(memory_allocation);
+        DISPLAYLEVEL(1, "Not enough memory \n");
+        goto _cleanup;
+    }
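+    /* esr.dict is created once (by reference, as raw content) and reused by
+     * ZDICT_countEStats() via ZSTD_compressBegin_usingCDict(esr.zc, esr.dict)
+     * for every sample, instead of copying a reference CCtx per sample. */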
 
     /* collect stats on all samples */
     for (u=0; u