Update to Zstandard 1.3.4

Includes our local patch to conditionalize use of __builtin_clz(ll) on Clang's __has_builtin() (which is just defined to false when building with GCC). The issue is tracked upstream at https://github.com/facebook/zstd/pull/884 . Otherwise, these are vanilla Zstandard 1.3.4 files. Reported by: allanjude, Yann Collet Sponsored by: Dell EMC Isilon
2018-03-26 23:54:59 +00:00 · 2018-03-26 23:54:59 +00:00 · 19fcbaf142
commit 19fcbaf142
parent 52f72944b8
82 changed files with 7465 additions and 4517 deletions
--- a/sys/contrib/zstd/Makefile
+++ b/sys/contrib/zstd/Makefile
@ -27,7 +27,7 @@ endif
 default: lib-release zstd-release

 .PHONY: all
-all: | allmost examples manual
+all: | allmost examples manual contrib

 .PHONY: allmost
 allmost: allzstd
@ -72,14 +72,18 @@ zstdmt:
 zlibwrapper:
 	$(MAKE) -C $(ZWRAPDIR) test

+.PHONY: test
+test:
+	$(MAKE) -C $(PRGDIR) allVariants MOREFLAGS+="-g -DZSTD_DEBUG=1"
+	$(MAKE) -C $(TESTDIR) $@
+
+.PHONY: shortest
+shortest:
+	$(MAKE) -C $(TESTDIR) $@
+
 .PHONY: check
 check: shortest

-.PHONY: test shortest
-test shortest:
-	$(MAKE) -C $(PRGDIR) allVariants MOREFLAGS="-g -DZSTD_DEBUG=1"
-	$(MAKE) -C $(TESTDIR) $@
-
 .PHONY: examples
 examples:
 	CPPFLAGS=-I../lib LDFLAGS=-L../lib $(MAKE) -C examples/ all
@ -88,6 +92,12 @@ examples:
 manual:
 	$(MAKE) -C contrib/gen_html $@

+.PHONY: contrib
+contrib: lib
+	$(MAKE) -C contrib/pzstd all
+	$(MAKE) -C contrib/seekable_format/examples all
+	$(MAKE) -C contrib/adaptive-compression all
+
 .PHONY: cleanTabs
 cleanTabs:
 	cd contrib; ./cleanTabs
@ -100,6 +110,9 @@ clean:
 	@$(MAKE) -C $(ZWRAPDIR) $@ > $(VOID)
 	@$(MAKE) -C examples/ $@ > $(VOID)
 	@$(MAKE) -C contrib/gen_html $@ > $(VOID)
+	@$(MAKE) -C contrib/pzstd $@ > $(VOID)
+	@$(MAKE) -C contrib/seekable_format/examples $@ > $(VOID)
+	@$(MAKE) -C contrib/adaptive-compression $@ > $(VOID)
 	@$(RM) zstd$(EXT) zstdmt$(EXT) tmp*
 	@$(RM) -r lz4
 	@echo Cleaning completed
@ -231,31 +244,31 @@ msanregressiontest:
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63303

 usan: clean
-	$(MAKE) test CC=clang MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize-recover=signed-integer-overflow -fsanitize=undefined"
+	$(MAKE) test CC=clang MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize-recover=signed-integer-overflow -fsanitize=undefined -Werror"

 asan: clean
-	$(MAKE) test CC=clang MOREFLAGS="-g -fsanitize=address"
+	$(MAKE) test CC=clang MOREFLAGS="-g -fsanitize=address -Werror"

 asan-%: clean
-	LDFLAGS=-fuse-ld=gold MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize=address" $(MAKE) -C $(TESTDIR) $*
+	LDFLAGS=-fuse-ld=gold MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize=address -Werror" $(MAKE) -C $(TESTDIR) $*

 msan: clean
-	$(MAKE) test CC=clang MOREFLAGS="-g -fsanitize=memory -fno-omit-frame-pointer" HAVE_LZMA=0   # datagen.c fails this test for no obvious reason
+	$(MAKE) test CC=clang MOREFLAGS="-g -fsanitize=memory -fno-omit-frame-pointer -Werror" HAVE_LZMA=0   # datagen.c fails this test for no obvious reason

 msan-%: clean
-	LDFLAGS=-fuse-ld=gold MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize=memory -fno-omit-frame-pointer" FUZZER_FLAGS=--no-big-tests $(MAKE) -C $(TESTDIR) HAVE_LZMA=0 $*
+	LDFLAGS=-fuse-ld=gold MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize=memory -fno-omit-frame-pointer -Werror" FUZZER_FLAGS=--no-big-tests $(MAKE) -C $(TESTDIR) HAVE_LZMA=0 $*

 asan32: clean
 	$(MAKE) -C $(TESTDIR) test32 CC=clang MOREFLAGS="-g -fsanitize=address"

 uasan: clean
-	$(MAKE) test CC=clang MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize-recover=signed-integer-overflow -fsanitize=address,undefined"
+	$(MAKE) test CC=clang MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize-recover=signed-integer-overflow -fsanitize=address,undefined -Werror"

 uasan-%: clean
-	LDFLAGS=-fuse-ld=gold MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize-recover=signed-integer-overflow -fsanitize=address,undefined" $(MAKE) -C $(TESTDIR) $*
+	LDFLAGS=-fuse-ld=gold MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize-recover=signed-integer-overflow -fsanitize=address,undefined -Werror" $(MAKE) -C $(TESTDIR) $*

 tsan-%: clean
-	LDFLAGS=-fuse-ld=gold MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize=thread" $(MAKE) -C $(TESTDIR) $* FUZZER_FLAGS=--no-big-tests
+	LDFLAGS=-fuse-ld=gold MOREFLAGS="-g -fno-sanitize-recover=all -fsanitize=thread -Werror" $(MAKE) -C $(TESTDIR) $* FUZZER_FLAGS=--no-big-tests

 apt-install:
 	sudo apt-get -yq --no-install-suggests --no-install-recommends --force-yes install $(APT_PACKAGES)
@ -279,6 +292,9 @@ libc6install:
 gcc6install: apt-add-repo
 	APT_PACKAGES="libc6-dev-i386 gcc-multilib gcc-6 gcc-6-multilib" $(MAKE) apt-install

+gcc7install: apt-add-repo
+	APT_PACKAGES="libc6-dev-i386 gcc-multilib gcc-7 gcc-7-multilib" $(MAKE) apt-install
+
 gpp6install: apt-add-repo
 	APT_PACKAGES="libc6-dev-i386 g++-multilib gcc-6 g++-6 g++-6-multilib" $(MAKE) apt-install

--- a/sys/contrib/zstd/NEWS
+++ b/sys/contrib/zstd/NEWS
@ -1,5 +1,23 @@
+v1.3.4
+perf: faster speed (especially decoding speed) on recent cpus (haswell+)
+perf: much better performance associating --long with multi-threading, by @terrelln
+perf: better compression at levels 13-15
+cli : asynchronous compression by default, for faster experience (use --single-thread for former behavior)
+cli : smoother status report in multi-threading mode
+cli : added command --fast=#, for faster compression modes
+cli : fix crash when not overwriting existing files, by Pádraig Brady (@pixelb)
+api : `nbThreads` becomes `nbWorkers` : 1 triggers asynchronous mode
+api : compression levels can be negative, for even more speed
+api : ZSTD_getFrameProgression() : get precise progress status of ZSTDMT anytime
+api : ZSTDMT can accept new compression parameters during compression
+api : implemented all advanced dictionary decompression prototypes
+build: improved meson recipe, by Shawn Landden (@shawnl)
+build: VS2017 scripts, by @HaydnTrigg
+misc: all /contrib projects fixed
+misc: added /contrib/docker script by @gyscos
+
 v1.3.3
-perf: faster zstd_opt strategy (levels 17-19)
+perf: faster zstd_opt strategy (levels 16-19)
 fix : bug #944 : multithreading with shared ditionary and large data, reported by @gsliepen
 cli : fix : content size written in header by default
 cli : fix : improved LZ4 format support, by @felixhandte
--- a/sys/contrib/zstd/README.md
+++ b/sys/contrib/zstd/README.md
@ -1,4 +1,4 @@
-<p align="center"><img src="https://raw.githubusercontent.com/facebook/zstd/readme/doc/images/zstd_logo86.png" alt="Zstandard"></p>
+<p align="center"><img src="https://raw.githubusercontent.com/facebook/zstd/dev/doc/images/zstd_logo86.png" alt="Zstandard"></p>

 __Zstandard__, or `zstd` as short version, is a fast lossless compression algorithm,
 targeting real-time compression scenarios at zlib-level and better compression ratios.
@ -21,24 +21,25 @@ Development branch status : [![Build Status][travisDevBadge]][travisLink]   [![B
 ### Benchmarks

 For reference, several fast compression algorithms were tested and compared
-on a server running Linux Debian (`Linux version 4.8.0-1-amd64`),
+on a server running Linux Debian (`Linux version 4.14.0-3-amd64`),
 with a Core i7-6700K CPU @ 4.0GHz,
 using [lzbench], an open-source in-memory benchmark by @inikep
-compiled with GCC 6.3.0,
+compiled with [gcc] 7.3.0,
 on the [Silesia compression corpus].

 [lzbench]: https://github.com/inikep/lzbench
 [Silesia compression corpus]: http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia
+[gcc]: https://gcc.gnu.org/

 | Compressor name         | Ratio | Compression| Decompress.|
 | ---------------         | ------| -----------| ---------- |
-| **zstd 1.1.3 -1**       | 2.877 |   430 MB/s |  1110 MB/s |
-| zlib 1.2.8 -1           | 2.743 |   110 MB/s |   400 MB/s |
-| brotli 0.5.2 -0         | 2.708 |   400 MB/s |   430 MB/s |
+| **zstd 1.3.4 -1**       | 2.877 |   470 MB/s |  1380 MB/s |
+| zlib 1.2.11 -1          | 2.743 |   110 MB/s |   400 MB/s |
+| brotli 1.0.2 -0         | 2.701 |   410 MB/s |   430 MB/s |
 | quicklz 1.5.0 -1        | 2.238 |   550 MB/s |   710 MB/s |
 | lzo1x 2.09 -1           | 2.108 |   650 MB/s |   830 MB/s |
-| lz4 1.7.5               | 2.101 |   720 MB/s |  3600 MB/s |
-| snappy 1.1.3            | 2.091 |   500 MB/s |  1650 MB/s |
+| lz4 1.8.1               | 2.101 |   750 MB/s |  3700 MB/s |
+| snappy 1.1.4            | 2.091 |   530 MB/s |  1800 MB/s |
 | lzf 3.6 -1              | 2.077 |   400 MB/s |   860 MB/s |

 [zlib]:http://www.zlib.net/
@ -50,15 +51,15 @@ Decompression speed is preserved and remains roughly the same at all settings,
 a property shared by most LZ compression algorithms, such as [zlib] or lzma.

 The following tests were run
-on a server running Linux Debian (`Linux version 4.8.0-1-amd64`)
+on a server running Linux Debian (`Linux version 4.14.0-3-amd64`)
 with a Core i7-6700K CPU @ 4.0GHz,
 using [lzbench], an open-source in-memory benchmark by @inikep
-compiled with GCC 6.3.0,
+compiled with [gcc] 7.3.0,
 on the [Silesia compression corpus].

 Compression Speed vs Ratio | Decompression Speed
 ---------------------------|--------------------
-![Compression Speed vs Ratio](doc/images/Cspeed4.png "Compression Speed vs Ratio") | ![Decompression Speed](doc/images/Dspeed4.png "Decompression Speed")
+![Compression Speed vs Ratio](doc/images/CSpeed2.png "Compression Speed vs Ratio") | ![Decompression Speed](doc/images/DSpeed3.png "Decompression Speed")

 A few other algorithms can produce higher compression ratios at slower speeds, falling outside of the graph.
 For a larger picture including slow modes, [click on this link](doc/images/DCspeed5.png).
@ -128,8 +129,8 @@ A Meson project is provided within `contrib/meson`.

 Going into `build` directory, you will find additional possibilities:
 - Projects for Visual Studio 2005, 2008 and 2010.
-  + VS2010 project is compatible with VS2012, VS2013 and VS2015.
- Automated build scripts for Visual compiler by @KrzysFR , in `build/VS_scripts`,
+  + VS2010 project is compatible with VS2012, VS2013, VS2015 and VS2017.
+- Automated build scripts for Visual compiler by [@KrzysFR](https://github.com/KrzysFR), in `build/VS_scripts`,
  which will build `zstd` cli and `libzstd` library without any need to open Visual Studio solution.


--- a/sys/contrib/zstd/appveyor.yml
+++ b/sys/contrib/zstd/appveyor.yml
@ -2,14 +2,13 @@
  version: 1.0.{build}
  branches:
    only:
-    - dev
    - master
  environment:
    matrix:
    - COMPILER: "gcc"
      HOST:     "mingw"
      PLATFORM: "x64"
-      SCRIPT:   "make allzstd MOREFLAGS=-static && make -C tests test-symbols fullbench-dll fullbench-lib"
+      SCRIPT:   "make allzstd MOREFLAGS=-static && make -C tests test-symbols fullbench-lib"
      ARTIFACT: "true"
      BUILD:    "true"
    - COMPILER: "gcc"
@ -80,12 +79,22 @@
      SET "LDFLAGS=../../zlib/libz.a" &&
      sh -c "%SCRIPT%" &&
      ( if [%COMPILER%]==[gcc] if [%ARTIFACT%]==[true]
+          ECHO Creating artifacts &&
+          ECHO %cd% &&
          lib\dll\example\build_package.bat &&
          make -C programs DEBUGFLAGS= clean zstd &&
          cd programs\ && 7z a -tzip -mx9 zstd-win-binary-%PLATFORM%.zip zstd.exe &&
          appveyor PushArtifact zstd-win-binary-%PLATFORM%.zip &&
          cp zstd.exe ..\bin\zstd.exe &&
-          cd ..\bin\ && 7z a -tzip -mx9 zstd-win-release-%PLATFORM%.zip * &&
+          git clone --depth 1 --branch master https://github.com/facebook/zstd &&
+          cd zstd &&
+          git archive --format=tar master -o zstd-src.tar &&
+          ..\zstd -19 zstd-src.tar &&
+          appveyor PushArtifact zstd-src.tar.zst &&
+          certUtil -hashfile zstd-src.tar.zst SHA256 > zstd-src.tar.zst.sha256.sig &&
+          appveyor PushArtifact zstd-src.tar.zst.sha256.sig &&
+          cd ..\..\bin\ &&
+          7z a -tzip -mx9 zstd-win-release-%PLATFORM%.zip * &&
          appveyor PushArtifact zstd-win-release-%PLATFORM%.zip
      )
    )
--- a/sys/contrib/zstd/contrib/gen_html/Makefile
+++ b/sys/contrib/zstd/contrib/gen_html/Makefile
@ -7,10 +7,10 @@
 # in the COPYING file in the root directory of this source tree).
 # ################################################################

-CFLAGS ?= -O3
-CFLAGS += -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow -Wstrict-aliasing=1 -Wswitch-enum -Wno-comment
-CFLAGS += $(MOREFLAGS)
-FLAGS   = $(CPPFLAGS) $(CFLAGS) $(CXXFLAGS) $(LDFLAGS)
+CXXFLAGS ?= -O3
+CXXFLAGS += -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow -Wstrict-aliasing=1 -Wswitch-enum -Wno-comment
+CXXFLAGS += $(MOREFLAGS)
+FLAGS   = $(CPPFLAGS) $(CXXFLAGS) $(CXXFLAGS) $(LDFLAGS)

 ZSTDAPI = ../../lib/zstd.h
 ZSTDMANUAL = ../../doc/zstd_manual.html
--- a/sys/contrib/zstd/contrib/meson/meson.build
+++ b/sys/contrib/zstd/contrib/meson/meson.build
@ -38,21 +38,45 @@ libzstd_srcs = [

 libzstd_includes = [include_directories(common_dir, dictbuilder_dir, compress_dir, lib_dir)]

-if get_option('legacy_support')
-    message('Enabling legacy support')
-    libzstd_cflags = ['-DZSTD_LEGACY_SUPPORT=4']
+legacy = get_option('legacy_support')
+if legacy == '0'
+    legacy = 'false'
+endif
+if legacy != 'false'
+    if legacy == 'true'
+        legacy = '1'
+    endif
+    #See ZSTD_LEGACY_SUPPORT of programs/README.md
+    message('Enabling legacy support back to version 0.' + legacy)
+    legacy_int = legacy.to_int()
+    if legacy_int > 7
+        legacy_int = 7
+    endif
+    libzstd_cflags = ['-DZSTD_LEGACY_SUPPORT=' + legacy]

    legacy_dir = join_paths(lib_dir, 'legacy')
    libzstd_includes += [include_directories(legacy_dir)]
-    libzstd_srcs += [
-        join_paths(legacy_dir, 'zstd_v01.c'),
-        join_paths(legacy_dir, 'zstd_v02.c'),
-        join_paths(legacy_dir, 'zstd_v03.c'),
-        join_paths(legacy_dir, 'zstd_v04.c'),
-        join_paths(legacy_dir, 'zstd_v05.c'),
-        join_paths(legacy_dir, 'zstd_v06.c'),
-        join_paths(legacy_dir, 'zstd_v07.c')
-    ]
+    if legacy_int <= 1
+        libzstd_srcs += join_paths(legacy_dir, 'zstd_v01.c')
+    endif
+    if legacy_int <= 2
+        libzstd_srcs += join_paths(legacy_dir, 'zstd_v02.c')
+    endif
+    if legacy_int <= 3
+        libzstd_srcs += join_paths(legacy_dir, 'zstd_v03.c')
+    endif
+    if legacy_int <= 4
+        libzstd_srcs += join_paths(legacy_dir, 'zstd_v04.c')
+    endif
+    if legacy_int <= 5
+        libzstd_srcs += join_paths(legacy_dir, 'zstd_v05.c')
+    endif
+    if legacy_int <= 6
+        libzstd_srcs += join_paths(legacy_dir, 'zstd_v06.c')
+    endif
+    if legacy_int <= 7
+        libzstd_srcs += join_paths(legacy_dir, 'zstd_v07.c')
+    endif
 else
    libzstd_cflags = []
 endif
@ -70,7 +94,9 @@ libzstd = library('zstd',
                  include_directories: libzstd_includes,
                  c_args: libzstd_cflags,
                  dependencies: libzstd_deps,
-                  install: true)
+                  install: true,
+                  soversion: '1',
+                 )

 programs_dir = join_paths('..', '..', 'programs')

--- a/sys/contrib/zstd/contrib/meson/meson_options.txt
+++ b/sys/contrib/zstd/contrib/meson/meson_options.txt
@ -1,2 +1,3 @@
 option('multithread', type: 'boolean', value: false)
-option('legacy_support', type: 'boolean', value: false)
+option('legacy_support', type: 'string', value: '4',
+    description: 'True or false, or 7 to 1 for v0.7+ to v0.1+.')
--- a/sys/contrib/zstd/contrib/seekable_format/zstdseek_compress.c
+++ b/sys/contrib/zstd/contrib/seekable_format/zstdseek_compress.c
@ -147,7 +147,7 @@ size_t ZSTD_seekable_initCStream(ZSTD_seekable_CStream* zcs,

    /* make sure maxFrameSize has a reasonable value */
    if (maxFrameSize > ZSTD_SEEKABLE_MAX_FRAME_DECOMPRESSED_SIZE) {
-        return ERROR(compressionParameter_unsupported);
+        return ERROR(frameParameter_unsupported);
    }

    zcs->maxFrameSize = maxFrameSize
--- a/sys/contrib/zstd/contrib/seekable_format/zstdseek_decompress.c
+++ b/sys/contrib/zstd/contrib/seekable_format/zstdseek_decompress.c
@ -125,7 +125,7 @@ static int ZSTD_seekable_seek_buff(void* opaque, S64 offset, int origin)
        newOffset = (unsigned long long)buff->size - offset;
        break;
    }
-    if (newOffset < 0 || newOffset > buff->size) {
+    if (newOffset > buff->size) {
        return -1;
    }
    buff->pos = newOffset;
@ -145,7 +145,7 @@ typedef struct {
    int checksumFlag;
 } seekTable_t;

-#define SEEKABLE_BUFF_SIZE ZSTD_BLOCKSIZE_ABSOLUTEMAX
+#define SEEKABLE_BUFF_SIZE ZSTD_BLOCKSIZE_MAX

 struct ZSTD_seekable_s {
    ZSTD_DStream* dstream;
--- a/sys/contrib/zstd/doc/README.md
+++ b/sys/contrib/zstd/doc/README.md
@ -2,19 +2,24 @@ Zstandard Documentation
 =======================

 This directory contains material defining the Zstandard format,
-as well as for help using the `zstd` library.
+as well as detailed instructions to use `zstd` library.
+
+__`zstd_manual.html`__ : Documentation of `zstd.h` API, in html format.
+Click on this link: [http://zstd.net/zstd_manual.html](http://zstd.net/zstd_manual.html)
+to display documentation of latest release in readable format within a browser.

 __`zstd_compression_format.md`__ : This document defines the Zstandard compression format.
 Compliant decoders must adhere to this document,
 and compliant encoders must generate data that follows it.

+Should you look for ressources to develop your own port of Zstandard algorithm,
+you may find the following ressources useful :
+
 __`educational_decoder`__ : This directory contains an implementation of a Zstandard decoder,
 compliant with the Zstandard compression format.
 It can be used, for example, to better understand the format,
-or as the basis for a separate implementation a Zstandard decoder/encoder.
-
-__`zstd_manual.html`__ : Documentation on the functions found in `zstd.h`.
-See [http://zstd.net/zstd_manual.html](http://zstd.net/zstd_manual.html) for
-the manual released with the latest official `zstd` release.
-
+or as the basis for a separate implementation of Zstandard decoder.

+[__`decode_corpus`__](https://github.com/facebook/zstd/tree/dev/tests#decodecorpus---tool-to-generate-zstandard-frames-for-decoder-testing) :
+This tool, stored in `/tests` directory, is able to generate random valid frames,
+which is useful if you wish to test your decoder and verify it fully supports the specification.
--- a/sys/contrib/zstd/doc/images/CSpeed2.png
+++ b/sys/contrib/zstd/doc/images/CSpeed2.png
--- a/sys/contrib/zstd/doc/images/Cspeed4.png
+++ b/sys/contrib/zstd/doc/images/Cspeed4.png
--- a/sys/contrib/zstd/doc/images/DSpeed3.png
+++ b/sys/contrib/zstd/doc/images/DSpeed3.png
--- a/sys/contrib/zstd/doc/images/Dspeed4.png
+++ b/sys/contrib/zstd/doc/images/Dspeed4.png
--- a/sys/contrib/zstd/doc/images/dict-cr.png
+++ b/sys/contrib/zstd/doc/images/dict-cr.png
--- a/sys/contrib/zstd/doc/images/dict-cs.png
+++ b/sys/contrib/zstd/doc/images/dict-cs.png
--- a/sys/contrib/zstd/doc/images/dict-ds.png
+++ b/sys/contrib/zstd/doc/images/dict-ds.png
--- a/sys/contrib/zstd/doc/zstd_compression_format.md
+++ b/sys/contrib/zstd/doc/zstd_compression_format.md
@ -257,7 +257,7 @@ a decoder is allowed to reject a compressed frame
 which requests a memory size beyond decoder's authorized range.

 For improved interoperability,
-decoders are recommended to be compatible with `Window_Size >= 8 MB`,
+decoders are recommended to be compatible with `Window_Size <= 8 MB`,
 and encoders are recommended to not request more than 8 MB.
 It's merely a recommendation though,
 decoders are free to support larger or lower limits,
--- a/sys/contrib/zstd/doc/zstd_manual.html
+++ b/sys/contrib/zstd/doc/zstd_manual.html
@ -1,17 +1,17 @@
 <html>
 <head>
 <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
-<title>zstd 1.3.3 Manual</title>
+<title>zstd 1.3.4 Manual</title>
 </head>
 <body>
-<h1>zstd 1.3.3 Manual</h1>
+<h1>zstd 1.3.4 Manual</h1>
 <hr>
 <a name="Contents"></a><h2>Contents</h2>
 <ol>
 <li><a href="#Chapter1">Introduction</a></li>
 <li><a href="#Chapter2">Version</a></li>
 <li><a href="#Chapter3">Simple API</a></li>
-<li><a href="#Chapter4">Explicit memory management</a></li>
+<li><a href="#Chapter4">Explicit context</a></li>
 <li><a href="#Chapter5">Simple dictionary API</a></li>
 <li><a href="#Chapter6">Bulk processing dictionary API</a></li>
 <li><a href="#Chapter7">Streaming</a></li>
@ -19,17 +19,16 @@
 <li><a href="#Chapter9">Streaming decompression - HowTo</a></li>
 <li><a href="#Chapter10">START OF ADVANCED AND EXPERIMENTAL FUNCTIONS</a></li>
 <li><a href="#Chapter11">Advanced types</a></li>
-<li><a href="#Chapter12">Custom memory allocation functions</a></li>
-<li><a href="#Chapter13">Frame size functions</a></li>
-<li><a href="#Chapter14">Context memory usage</a></li>
-<li><a href="#Chapter15">Advanced compression functions</a></li>
-<li><a href="#Chapter16">Advanced decompression functions</a></li>
-<li><a href="#Chapter17">Advanced streaming functions</a></li>
-<li><a href="#Chapter18">Buffer-less and synchronous inner streaming functions</a></li>
-<li><a href="#Chapter19">Buffer-less streaming compression (synchronous mode)</a></li>
-<li><a href="#Chapter20">Buffer-less streaming decompression (synchronous mode)</a></li>
-<li><a href="#Chapter21">New advanced API (experimental)</a></li>
-<li><a href="#Chapter22">Block level API</a></li>
+<li><a href="#Chapter12">Frame size functions</a></li>
+<li><a href="#Chapter13">Memory management</a></li>
+<li><a href="#Chapter14">Advanced compression functions</a></li>
+<li><a href="#Chapter15">Advanced decompression functions</a></li>
+<li><a href="#Chapter16">Advanced streaming functions</a></li>
+<li><a href="#Chapter17">Buffer-less and synchronous inner streaming functions</a></li>
+<li><a href="#Chapter18">Buffer-less streaming compression (synchronous mode)</a></li>
+<li><a href="#Chapter19">Buffer-less streaming decompression (synchronous mode)</a></li>
+<li><a href="#Chapter20">New advanced API (experimental)</a></li>
+<li><a href="#Chapter21">Block level API</a></li>
 </ol>
 <hr>
 <a name="Chapter1"></a><h2>Introduction</h2><pre>
@ -40,11 +39,11 @@
  Levels >= 20, labeled `--ultra`, should be used with caution, as they require more memory.
  Compression can be done in:
    - a single step (described as Simple API)
-    - a single step, reusing a context (described as Explicit memory management)
+    - a single step, reusing a context (described as Explicit context)
    - unbounded multiple steps (described as Streaming compression)
  The compression ratio achievable on small data can be highly improved using a dictionary in:
    - a single step (described as Simple dictionary API)
-    - a single step, reusing a dictionary (described as Fast dictionary API)
+    - a single step, reusing a dictionary (described as Bulk-processing dictionary API)

  Advanced experimental functions can be accessed using #define ZSTD_STATIC_LINKING_ONLY before including zstd.h.
  Advanced experimental APIs shall never be used with a dynamic library.
@ -103,22 +102,20 @@ unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);

 <pre><b>unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
 </b><p>  NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize().
-  Both functions work the same way,
-  but ZSTD_getDecompressedSize() blends
-  "empty", "unknown" and "error" results in the same return value (0),
-  while ZSTD_getFrameContentSize() distinguishes them.
-
-  'src' is the start of a zstd compressed frame.
-  @return : content size to be decompressed, as a 64-bits value _if known and not empty_, 0 otherwise. 
+  Both functions work the same way, but ZSTD_getDecompressedSize() blends
+  "empty", "unknown" and "error" results to the same return value (0),
+  while ZSTD_getFrameContentSize() gives them separate return values.
+ `src` is the start of a zstd compressed frame.
+ @return : content size to be decompressed, as a 64-bits value _if known and not empty_, 0 otherwise. 
 </p></pre><BR>

 <h3>Helper functions</h3><pre></pre><b><pre>#define ZSTD_COMPRESSBOUND(srcSize)   ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) </b>/* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */<b>
-size_t      ZSTD_compressBound(size_t srcSize); </b>/*!< maximum compressed size in worst case scenario */<b>
+size_t      ZSTD_compressBound(size_t srcSize); </b>/*!< maximum compressed size in worst case single-pass scenario */<b>
 unsigned    ZSTD_isError(size_t code);          </b>/*!< tells if a `size_t` function result is an error code */<b>
 const char* ZSTD_getErrorName(size_t code);     </b>/*!< provides readable string from an error code */<b>
 int         ZSTD_maxCLevel(void);               </b>/*!< maximum compression level available */<b>
 </pre></b><BR>
-<a name="Chapter4"></a><h2>Explicit memory management</h2><pre></pre>
+<a name="Chapter4"></a><h2>Explicit context</h2><pre></pre>

 <h3>Compression context</h3><pre>  When compressing many times,
  it is recommended to allocate a context just once, and re-use it for each successive compression operation.
@ -347,11 +344,18 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
    ZSTD_frameParameters fParams;
 } ZSTD_parameters;
 </b></pre><BR>
-<a name="Chapter12"></a><h2>Custom memory allocation functions</h2><pre></pre>
-
-<pre><b>typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+<pre><b>typedef enum {
+    ZSTD_dct_auto=0,      </b>/* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */<b>
+    ZSTD_dct_rawContent,  </b>/* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */<b>
+    ZSTD_dct_fullDict     </b>/* refuses to load a dictionary if it does not respect Zstandard's specification */<b>
+} ZSTD_dictContentType_e;
 </b></pre><BR>
-<a name="Chapter13"></a><h2>Frame size functions</h2><pre></pre>
+<pre><b>typedef enum {
+    ZSTD_dlm_byCopy = 0, </b>/**< Copy dictionary content internally */<b>
+    ZSTD_dlm_byRef,      </b>/**< Reference dictionary content -- the dictionary buffer must outlive its users. */<b>
+} ZSTD_dictLoadMethod_e;
+</b></pre><BR>
+<a name="Chapter12"></a><h2>Frame size functions</h2><pre></pre>

 <pre><b>size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize);
 </b><p>  `src` should point to the start of a ZSTD encoded frame or skippable frame
@ -390,7 +394,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
   @return : size of the Frame Header 
 </p></pre><BR>

-<a name="Chapter14"></a><h2>Context memory usage</h2><pre></pre>
+<a name="Chapter13"></a><h2>Memory management</h2><pre></pre>

 <pre><b>size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
 size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx);
@ -399,7 +403,7 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
 size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict);
 size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
 </b><p>  These functions give the current memory usage of selected object.
-  Object memory usage can evolve when re-used multiple times. 
+  Object memory usage can evolve when re-used. 
 </p></pre><BR>

 <pre><b>size_t ZSTD_estimateCCtxSize(int compressionLevel);
@ -412,8 +416,8 @@ size_t ZSTD_estimateDCtxSize(void);
  It will also consider src size to be arbitrarily "large", which is worst case.
  If srcSize is known to always be small, ZSTD_estimateCCtxSize_usingCParams() can provide a tighter estimation.
  ZSTD_estimateCCtxSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
-  ZSTD_estimateCCtxSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParam_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_p_nbThreads is > 1.
-  Note : CCtx estimation is only correct for single-threaded compression 
+  ZSTD_estimateCCtxSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParam_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_p_nbWorkers is >= 1.
+  Note : CCtx size estimation is only correct for single-threaded compression. 
 </p></pre><BR>

 <pre><b>size_t ZSTD_estimateCStreamSize(int compressionLevel);
@ -425,8 +429,8 @@ size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
  It will also consider src size to be arbitrarily "large", which is worst case.
  If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
  ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
-  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParam_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_p_nbThreads is set to a value > 1.
-  Note : CStream estimation is only correct for single-threaded compression.
+  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParam_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_p_nbWorkers is >= 1.
+  Note : CStream size estimation is only correct for single-threaded compression.
  ZSTD_DStream memory budget depends on window Size.
  This information can be passed manually, using ZSTD_estimateDStreamSize,
  or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
@ -435,83 +439,59 @@ size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
         In this case, get total size by adding ZSTD_estimate?DictSize 
 </p></pre><BR>

-<pre><b>typedef enum {
-    ZSTD_dlm_byCopy = 0,     </b>/**< Copy dictionary content internally */<b>
-    ZSTD_dlm_byRef,          </b>/**< Reference dictionary content -- the dictionary buffer must outlive its users. */<b>
-} ZSTD_dictLoadMethod_e;
-</b></pre><BR>
 <pre><b>size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
 size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod);
 size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod);
 </b><p>  ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
-  ZSTD_estimateCStreamSize_advanced_usingCParams() makes it possible to control precisely compression parameters, like ZSTD_createCDict_advanced().
-  Note : dictionary created by reference using ZSTD_dlm_byRef are smaller
+  ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced().
+  Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller.
 
 </p></pre><BR>

-<a name="Chapter15"></a><h2>Advanced compression functions</h2><pre></pre>
-
-<pre><b>ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
-</b><p>  Create a ZSTD compression context using external alloc and free functions 
-</p></pre><BR>
-
-<pre><b>ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
-</b><p>  workspace: The memory area to emplace the context into.
-             Provided pointer must 8-bytes aligned.
-             It must outlive context usage.
-  workspaceSize: Use ZSTD_estimateCCtxSize() or ZSTD_estimateCStreamSize()
-                 to determine how large workspace must be to support scenario.
- @return : pointer to ZSTD_CCtx* (same address as workspace, but different type),
-           or NULL if error (typically size too small)
-  Note : zstd will never resize nor malloc() when using a static cctx.
-         If it needs more memory than available, it will simply error out.
+<pre><b>ZSTD_CCtx*    ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
+ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize);    </b>/**< same as ZSTD_initStaticCCtx() */<b>
+</b><p>  Initialize an object using a pre-allocated fixed-size buffer.
+  workspace: The memory area to emplace the object into.
+             Provided pointer *must be 8-bytes aligned*.
+             Buffer must outlive object.
+  workspaceSize: Use ZSTD_estimate*Size() to determine
+                 how large workspace must be to support target scenario.
+ @return : pointer to object (same address as workspace, just different type),
+           or NULL if error (size too small, incorrect alignment, etc.)
+  Note : zstd will never resize nor malloc() when using a static buffer.
+         If the object requires more memory than available,
+         zstd will just error out (typically ZSTD_error_memory_allocation).
  Note 2 : there is no corresponding "free" function.
-           Since workspace was allocated externally, it must be freed externally too.
-  Limitation 1 : currently not compatible with internal CDict creation, such as
-                 ZSTD_CCtx_loadDictionary() or ZSTD_initCStream_usingDict().
-  Limitation 2 : currently not compatible with multi-threading
+           Since workspace is allocated externally, it must be freed externally too.
+  Note 3 : cParams : use ZSTD_getCParams() to convert a compression level
+           into its associated cParams.
+  Limitation 1 : currently not compatible with internal dictionary creation, triggered by
+                 ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict().
+  Limitation 2 : static cctx currently not compatible with multi-threading.
+  Limitation 3 : static dctx is incompatible with legacy support.
 
 </p></pre><BR>

+<pre><b>ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize);    </b>/**< same as ZSTD_initStaticDCtx() */<b>
+</b></pre><BR>
+<pre><b>typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void  (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL };  </b>/**< this constant defers to stdlib's functions */<b>
+</b><p>  These prototypes make it possible to pass your own allocation/free functions.
+  ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below.
+  All allocation/free operations will be completed using these custom variants instead of regular <stdlib.h> ones.
+ 
+</p></pre><BR>
+
+<a name="Chapter14"></a><h2>Advanced compression functions</h2><pre></pre>
+
 <pre><b>ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel);
 </b><p>  Create a digested dictionary for compression
  Dictionary content is simply referenced, and therefore stays in dictBuffer.
  It is important that dictBuffer outlives CDict, it must remain read accessible throughout the lifetime of CDict 
 </p></pre><BR>

-<pre><b>typedef enum { ZSTD_dm_auto=0,        </b>/* dictionary is "full" if it starts with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */<b>
-               ZSTD_dm_rawContent,    </b>/* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */<b>
-               ZSTD_dm_fullDict       </b>/* refuses to load a dictionary if it does not respect Zstandard's specification */<b>
-} ZSTD_dictMode_e;
-</b></pre><BR>
-<pre><b>ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
-                                      ZSTD_dictLoadMethod_e dictLoadMethod,
-                                      ZSTD_dictMode_e dictMode,
-                                      ZSTD_compressionParameters cParams,
-                                      ZSTD_customMem customMem);
-</b><p>  Create a ZSTD_CDict using external alloc and free, and customized compression parameters 
-</p></pre><BR>
-
-<pre><b>ZSTD_CDict* ZSTD_initStaticCDict(
-                void* workspace, size_t workspaceSize,
-          const void* dict, size_t dictSize,
-                ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictMode_e dictMode,
-                ZSTD_compressionParameters cParams);
-</b><p>  Generate a digested dictionary in provided memory area.
-  workspace: The memory area to emplace the dictionary into.
-             Provided pointer must 8-bytes aligned.
-             It must outlive dictionary usage.
-  workspaceSize: Use ZSTD_estimateCDictSize()
-                 to determine how large workspace must be.
-  cParams : use ZSTD_getCParams() to transform a compression level
-            into its relevants cParams.
- @return : pointer to ZSTD_CDict* (same address as workspace, but different type),
-           or NULL if error (typically, size too small).
-  Note : there is no corresponding "free" function.
-         Since workspace was allocated externally, it must be freed externally.
- 
-</p></pre><BR>
-
 <pre><b>ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
 </b><p>   @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize.
   `estimatedSrcSize` value is optional, select 0 if not known 
@ -546,7 +526,7 @@ size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMet
 </b><p>   Same as ZSTD_compress_usingCDict(), with fine-tune control over frame parameters 
 </p></pre><BR>

-<a name="Chapter16"></a><h2>Advanced decompression functions</h2><pre></pre>
+<a name="Chapter15"></a><h2>Advanced decompression functions</h2><pre></pre>

 <pre><b>unsigned ZSTD_isFrame(const void* buffer, size_t size);
 </b><p>  Tells if the content of `buffer` starts with a valid Frame Identifier.
@ -555,28 +535,6 @@ size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMet
  Note 3 : Skippable Frame Identifiers are considered valid. 
 </p></pre><BR>

-<pre><b>ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
-</b><p>  Create a ZSTD decompression context using external alloc and free functions 
-</p></pre><BR>
-
-<pre><b>ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
-</b><p>  workspace: The memory area to emplace the context into.
-             Provided pointer must 8-bytes aligned.
-             It must outlive context usage.
-  workspaceSize: Use ZSTD_estimateDCtxSize() or ZSTD_estimateDStreamSize()
-                 to determine how large workspace must be to support scenario.
- @return : pointer to ZSTD_DCtx* (same address as workspace, but different type),
-           or NULL if error (typically size too small)
-  Note : zstd will never resize nor malloc() when using a static dctx.
-         If it needs more memory than available, it will simply error out.
-  Note 2 : static dctx is incompatible with legacy support
-  Note 3 : there is no corresponding "free" function.
-           Since workspace was allocated externally, it must be freed externally.
-  Limitation : currently not compatible with internal DDict creation,
-               such as ZSTD_initDStream_usingDict().
- 
-</p></pre><BR>
-
 <pre><b>ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize);
 </b><p>  Create a digested dictionary, ready to start decompression operation without startup delay.
  Dictionary content is referenced, and therefore stays in dictBuffer.
@ -584,27 +542,6 @@ size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMet
  it must remain read accessible throughout the lifetime of DDict 
 </p></pre><BR>

-<pre><b>ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
-                                      ZSTD_dictLoadMethod_e dictLoadMethod,
-                                      ZSTD_customMem customMem);
-</b><p>  Create a ZSTD_DDict using external alloc and free, optionally by reference 
-</p></pre><BR>
-
-<pre><b>ZSTD_DDict* ZSTD_initStaticDDict(void* workspace, size_t workspaceSize,
-                                 const void* dict, size_t dictSize,
-                                 ZSTD_dictLoadMethod_e dictLoadMethod);
-</b><p>  Generate a digested dictionary in provided memory area.
-  workspace: The memory area to emplace the dictionary into.
-             Provided pointer must 8-bytes aligned.
-             It must outlive dictionary usage.
-  workspaceSize: Use ZSTD_estimateDDictSize()
-                 to determine how large workspace must be.
- @return : pointer to ZSTD_DDict*, or NULL if error (size too small)
-  Note : there is no corresponding "free" function.
-         Since workspace was allocated externally, it must be freed externally.
- 
-</p></pre><BR>
-
 <pre><b>unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize);
 </b><p>  Provides the dictID stored within dictionary.
  if @return == 0, the dictionary is not conformant with Zstandard specification.
@ -629,11 +566,9 @@ size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMet
  When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. 
 </p></pre><BR>

-<a name="Chapter17"></a><h2>Advanced streaming functions</h2><pre></pre>
+<a name="Chapter16"></a><h2>Advanced streaming functions</h2><pre></pre>

-<h3>Advanced Streaming compression functions</h3><pre></pre><b><pre>ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
-ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize);    </b>/**< same as ZSTD_initStaticCCtx() */<b>
-size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize);   </b>/**< pledgedSrcSize must be correct. If it is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, "0" also disables frame content size field. It may be enabled in the future. */<b>
+<h3>Advanced Streaming compression functions</h3><pre></pre><b><pre>size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize);   </b>/**< pledgedSrcSize must be correct. If it is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, "0" also disables frame content size field. It may be enabled in the future. */<b>
 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); </b>/**< creates of an internal CDict (incompatible with static CCtx), except if dict == NULL or dictSize < 8, in which case no dict is used. Note: dict is loaded with ZSTD_dm_auto (treated as a full zstd dictionary if it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy.*/<b>
 size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize,
                                             ZSTD_parameters params, unsigned long long pledgedSrcSize);  </b>/**< pledgedSrcSize must be correct. If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. dict is loaded with ZSTD_dm_auto and ZSTD_dlm_byCopy. */<b>
@ -647,26 +582,30 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict*
  If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
  For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs,
-  but it may change to mean "empty" in some future version, so prefer using macro ZSTD_CONTENTSIZE_UNKNOWN.
+  but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead.
 @return : 0, or an error code (which can be tested using ZSTD_isError()) 
 </p></pre><BR>

-<h3>Advanced Streaming decompression functions</h3><pre></pre><b><pre>ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
-ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize);    </b>/**< same as ZSTD_initStaticDCtx() */<b>
-typedef enum { DStream_p_maxWindowSize } ZSTD_DStreamParameter_e;
+<pre><b>typedef struct {
+    unsigned long long ingested;
+    unsigned long long consumed;
+    unsigned long long produced;
+} ZSTD_frameProgression;
+</b></pre><BR>
+<h3>Advanced Streaming decompression functions</h3><pre></pre><b><pre>typedef enum { DStream_p_maxWindowSize } ZSTD_DStreamParameter_e;
 size_t ZSTD_setDStreamParameter(ZSTD_DStream* zds, ZSTD_DStreamParameter_e paramType, unsigned paramValue);   </b>/* obsolete : this API will be removed in a future version */<b>
 size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); </b>/**< note: no dictionary will be used if dict == NULL or dictSize < 8 */<b>
 size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);  </b>/**< note : ddict is referenced, it must outlive decompression session */<b>
 size_t ZSTD_resetDStream(ZSTD_DStream* zds);  </b>/**< re-use decompression parameters from previous init; saves dictionary loading */<b>
 </pre></b><BR>
-<a name="Chapter18"></a><h2>Buffer-less and synchronous inner streaming functions</h2><pre>
+<a name="Chapter17"></a><h2>Buffer-less and synchronous inner streaming functions</h2><pre>
  This is an advanced API, giving full control over buffer management, for users which need direct control over memory.
  But it's also a complex one, with several restrictions, documented below.
  Prefer normal streaming API for an easier experience.
 
 <BR></pre>

-<a name="Chapter19"></a><h2>Buffer-less streaming compression (synchronous mode)</h2><pre>
+<a name="Chapter18"></a><h2>Buffer-less streaming compression (synchronous mode)</h2><pre>
  A ZSTD_CCtx object is required to track streaming operations.
  Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
  ZSTD_CCtx object can be re-used multiple times within successive compression operations.
@ -702,7 +641,7 @@ size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
 size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize);   </b>/* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */<b>
 size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); </b>/**<  note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */<b>
 </pre></b><BR>
-<a name="Chapter20"></a><h2>Buffer-less streaming decompression (synchronous mode)</h2><pre>
+<a name="Chapter19"></a><h2>Buffer-less streaming decompression (synchronous mode)</h2><pre>
  A ZSTD_DCtx object is required to track streaming operations.
  Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
  A ZSTD_DCtx object can be re-used multiple times.
@ -788,15 +727,15 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long
 </pre></b><BR>
 <pre><b>typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
 </b></pre><BR>
-<a name="Chapter21"></a><h2>New advanced API (experimental)</h2><pre></pre>
+<a name="Chapter20"></a><h2>New advanced API (experimental)</h2><pre></pre>

 <pre><b>typedef enum {
-    </b>/* Question : should we have a format ZSTD_f_auto ?<b>
-     * For the time being, it would mean exactly the same as ZSTD_f_zstd1.
-     * But, in the future, should several formats be supported,
+    </b>/* Opened question : should we have a format ZSTD_f_auto ?<b>
+     * Today, it would mean exactly the same as ZSTD_f_zstd1.
+     * But, in the future, should several formats become supported,
     * on the compression side, it would mean "default format".
-     * On the decompression side, it would mean "multi format",
-     * and ZSTD_f_zstd1 could be reserved to mean "accept *only* zstd frames".
+     * On the decompression side, it would mean "automatic format detection",
+     * so that ZSTD_f_zstd1 would mean "accept *only* zstd frames".
     * Since meaning is a little different, another option could be to define different enums for compression and decompression.
     * This question could be kept for later, when there are actually multiple formats to support,
     * but there is also the question of pinning enum values, and pinning value `0` is especially important */
@ -814,42 +753,76 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long
    </b>/* compression parameters */<b>
    ZSTD_p_compressionLevel=100, </b>/* Update all compression parameters according to pre-defined cLevel table<b>
                              * Default level is ZSTD_CLEVEL_DEFAULT==3.
-                              * Special: value 0 means "do not change cLevel". */
+                              * Special: value 0 means "do not change cLevel".
+                              * Note 1 : it's possible to pass a negative compression level by casting it to unsigned type.
+                              * Note 2 : setting a level sets all default values of other compression parameters.
+                              * Note 3 : setting compressionLevel automatically updates ZSTD_p_compressLiterals. */
    ZSTD_p_windowLog,        </b>/* Maximum allowed back-reference distance, expressed as power of 2.<b>
                              * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX.
-                              * Special: value 0 means "do not change windowLog".
+                              * Special: value 0 means "use default windowLog".
                              * Note: Using a window size greater than ZSTD_MAXWINDOWSIZE_DEFAULT (default: 2^27)
-                              * requires setting the maximum window size at least as large during decompression. */
+                              *       requires explicitly allowing such window size during decompression stage. */
    ZSTD_p_hashLog,          </b>/* Size of the probe table, as a power of 2.<b>
                              * Resulting table size is (1 << (hashLog+2)).
                              * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX.
                              * Larger tables improve compression ratio of strategies <= dFast,
                              * and improve speed of strategies > dFast.
-                              * Special: value 0 means "do not change hashLog". */
+                              * Special: value 0 means "use default hashLog". */
    ZSTD_p_chainLog,         </b>/* Size of the full-search table, as a power of 2.<b>
                              * Resulting table size is (1 << (chainLog+2)).
                              * Larger tables result in better and slower compression.
                              * This parameter is useless when using "fast" strategy.
-                              * Special: value 0 means "do not change chainLog". */
+                              * Special: value 0 means "use default chainLog". */
    ZSTD_p_searchLog,        </b>/* Number of search attempts, as a power of 2.<b>
                              * More attempts result in better and slower compression.
                              * This parameter is useless when using "fast" and "dFast" strategies.
-                              * Special: value 0 means "do not change searchLog". */
+                              * Special: value 0 means "use default searchLog". */
    ZSTD_p_minMatch,         </b>/* Minimum size of searched matches (note : repCode matches can be smaller).<b>
                              * Larger values make faster compression and decompression, but decrease ratio.
                              * Must be clamped between ZSTD_SEARCHLENGTH_MIN and ZSTD_SEARCHLENGTH_MAX.
                              * Note that currently, for all strategies < btopt, effective minimum is 4.
-                              * Note that currently, for all strategies > fast, effective maximum is 6.
-                              * Special: value 0 means "do not change minMatchLength". */
-    ZSTD_p_targetLength,     </b>/* Only useful for strategies >= btopt.<b>
-                              * Length of Match considered "good enough" to stop search.
-                              * Larger values make compression stronger and slower.
-                              * Special: value 0 means "do not change targetLength". */
+                              *                    , for all strategies > fast, effective maximum is 6.
+                              * Special: value 0 means "use default minMatchLength". */
+    ZSTD_p_targetLength,     </b>/* Impact of this field depends on strategy.<b>
+                              * For strategies btopt & btultra:
+                              *     Length of Match considered "good enough" to stop search.
+                              *     Larger values make compression stronger, and slower.
+                              * For strategy fast:
+                              *     Distance between match sampling.
+                              *     Larger values make compression faster, and weaker.
+                              * Special: value 0 means "use default targetLength". */
    ZSTD_p_compressionStrategy, </b>/* See ZSTD_strategy enum definition.<b>
                              * Cast selected strategy as unsigned for ZSTD_CCtx_setParameter() compatibility.
                              * The higher the value of selected strategy, the more complex it is,
                              * resulting in stronger and slower compression.
-                              * Special: value 0 means "do not change strategy". */
+                              * Special: value 0 means "use default strategy". */
+
+    ZSTD_p_enableLongDistanceMatching=160, </b>/* Enable long distance matching.<b>
+                                         * This parameter is designed to improve compression ratio
+                                         * for large inputs, by finding large matches at long distance.
+                                         * It increases memory usage and window size.
+                                         * Note: enabling this parameter increases ZSTD_p_windowLog to 128 MB
+                                         * except when expressly set to a different value. */
+    ZSTD_p_ldmHashLog,       </b>/* Size of the table for long distance matching, as a power of 2.<b>
+                              * Larger values increase memory usage and compression ratio,
+                              * but decrease compression speed.
+                              * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX
+                              * default: windowlog - 7.
+                              * Special: value 0 means "automatically determine hashlog". */
+    ZSTD_p_ldmMinMatch,      </b>/* Minimum match size for long distance matcher.<b>
+                              * Larger/too small values usually decrease compression ratio.
+                              * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX.
+                              * Special: value 0 means "use default value" (default: 64). */
+    ZSTD_p_ldmBucketSizeLog, </b>/* Log size of each bucket in the LDM hash table for collision resolution.<b>
+                              * Larger values improve collision resolution but decrease compression speed.
+                              * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX .
+                              * Special: value 0 means "use default value" (default: 3). */
+    ZSTD_p_ldmHashEveryLog,  </b>/* Frequency of inserting/looking up entries in the LDM hash table.<b>
+                              * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN).
+                              * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage.
+                              * Larger values improve compression speed.
+                              * Deviating far from default value will likely result in a compression ratio decrease.
+                              * Special: value 0 means "automatically determine hashEveryLog". */

    </b>/* frame parameters */<b>
    ZSTD_p_contentSizeFlag=200, </b>/* Content size will be written into frame header _whenever known_ (default:1)<b>
@ -859,57 +832,44 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long
    ZSTD_p_dictIDFlag,       </b>/* When applicable, dictionary's ID is written into frame header (default:1) */<b>

    </b>/* multi-threading parameters */<b>
-    ZSTD_p_nbThreads=400,    </b>/* Select how many threads a compression job can spawn (default:1)<b>
-                              * More threads improve speed, but also increase memory usage.
-                              * Can only receive a value > 1 if ZSTD_MULTITHREAD is enabled.
-                              * Special: value 0 means "do not change nbThreads" */
-    ZSTD_p_jobSize,          </b>/* Size of a compression job. This value is only enforced in streaming (non-blocking) mode.<b>
-                              * Each compression job is completed in parallel, so indirectly controls the nb of active threads.
+    </b>/* These parameters are only useful if multi-threading is enabled (ZSTD_MULTITHREAD).<b>
+     * They return an error otherwise. */
+    ZSTD_p_nbWorkers=400,    </b>/* Select how many threads will be spawned to compress in parallel.<b>
+                              * When nbWorkers >= 1, triggers asynchronous mode :
+                              * ZSTD_compress_generic() consumes some input, flush some output if possible, and immediately gives back control to caller,
+                              * while compression work is performed in parallel, within worker threads.
+                              * (note : a strong exception to this rule is when first invocation sets ZSTD_e_end : it becomes a blocking call).
+                              * More workers improve speed, but also increase memory usage.
+                              * Default value is `0`, aka "single-threaded mode" : no worker is spawned, compression is performed inside Caller's thread, all invocations are blocking */
+    ZSTD_p_jobSize,          </b>/* Size of a compression job. This value is enforced only in non-blocking mode.<b>
+                              * Each compression job is completed in parallel, so this value indirectly controls the nb of active threads.
                              * 0 means default, which is dynamically determined based on compression parameters.
-                              * Job size must be a minimum of overlapSize, or 1 KB, whichever is largest
+                              * Job size must be a minimum of overlapSize, or 1 MB, whichever is largest.
                              * The minimum size is automatically and transparently enforced */
    ZSTD_p_overlapSizeLog,   </b>/* Size of previous input reloaded at the beginning of each job.<b>
                              * 0 => no overlap, 6(default) => use 1/8th of windowSize, >=9 => use full windowSize */

-    </b>/* advanced parameters - may not remain available after API update */<b>
+    </b>/* =================================================================== */<b>
+    </b>/* experimental parameters - no stability guaranteed                   */<b>
+    </b>/* =================================================================== */<b>
+
+    ZSTD_p_compressLiterals=1000, </b>/* control huffman compression of literals (enabled) by default.<b>
+                              * disabling it improves speed and decreases compression ratio by a large amount.
+                              * note : this setting is automatically updated when changing compression level.
+                              *        positive compression levels set ZSTD_p_compressLiterals to 1.
+                              *        negative compression levels set ZSTD_p_compressLiterals to 0. */
+
    ZSTD_p_forceMaxWindow=1100, </b>/* Force back-reference distances to remain < windowSize,<b>
                              * even when referencing into Dictionary content (default:0) */
-    ZSTD_p_enableLongDistanceMatching=1200,  </b>/* Enable long distance matching.<b>
-                                         * This parameter is designed to improve the compression
-                                         * ratio for large inputs with long distance matches.
-                                         * This increases the memory usage as well as window size.
-                                         * Note: setting this parameter sets all the LDM parameters
-                                         * as well as ZSTD_p_windowLog. It should be set after
-                                         * ZSTD_p_compressionLevel and before ZSTD_p_windowLog and
-                                         * other LDM parameters. Setting the compression level
-                                         * after this parameter overrides the window log, though LDM
-                                         * will remain enabled until explicitly disabled. */
-    ZSTD_p_ldmHashLog,   </b>/* Size of the table for long distance matching, as a power of 2.<b>
-                          * Larger values increase memory usage and compression ratio, but decrease
-                          * compression speed.
-                          * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX
-                          * (default: windowlog - 7). */
-    ZSTD_p_ldmMinMatch,  </b>/* Minimum size of searched matches for long distance matcher.<b>
-                          * Larger/too small values usually decrease compression ratio.
-                          * Must be clamped between ZSTD_LDM_MINMATCH_MIN
-                          * and ZSTD_LDM_MINMATCH_MAX (default: 64). */
-    ZSTD_p_ldmBucketSizeLog,  </b>/* Log size of each bucket in the LDM hash table for collision resolution.<b>
-                               * Larger values usually improve collision resolution but may decrease
-                               * compression speed.
-                               * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX (default: 3). */
-    ZSTD_p_ldmHashEveryLog,  </b>/* Frequency of inserting/looking up entries in the LDM hash table.<b>
-                              * The default is MAX(0, (windowLog - ldmHashLog)) to
-                              * optimize hash table usage.
-                              * Larger values improve compression speed. Deviating far from the
-                              * default value will likely result in a decrease in compression ratio.
-                              * Must be clamped between 0 and ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN. */

 } ZSTD_cParameter;
 </b></pre><BR>
 <pre><b>size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned value);
 </b><p>  Set one compression parameter, selected by enum ZSTD_cParameter.
+  Setting a parameter is generally only possible during frame initialization (before starting compression),
+  except for a few exceptions which can be updated during compression: compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy.
  Note : when `value` is an enum, cast it to unsigned for proper type checking.
-  @result : informational value (typically, the one being set, possibly corrected),
+  @result : informational value (typically, value being set clamped correctly),
            or an error code (which can be tested with ZSTD_isError()). 
 </p></pre><BR>

@ -926,24 +886,22 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long

 <pre><b>size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
 size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
-size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictMode_e dictMode);
-</b><p>  Create an internal CDict from dict buffer.
-  Decompression will have to use same buffer.
+size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+</b><p>  Create an internal CDict from `dict` buffer.
+  Decompression will have to use same dictionary.
 @result : 0, or an error code (which can be tested with ZSTD_isError()).
-  Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
-            meaning "return to no-dictionary mode".
-  Note 1 : `dict` content will be copied internally. Use
-            ZSTD_CCtx_loadDictionary_byReference() to reference dictionary
-            content instead. The dictionary buffer must then outlive its
-            users.
+  Special: Adding a NULL (or 0-size) dictionary invalidates previous dictionary,
+           meaning "return to no-dictionary mode".
+  Note 1 : Dictionary will be used for all future compression jobs.
+           To return to "no-dictionary" situation, load a NULL dictionary
  Note 2 : Loading a dictionary involves building tables, which are dependent on compression parameters.
           For this reason, compression parameters cannot be changed anymore after loading a dictionary.
-           It's also a CPU-heavy operation, with non-negligible impact on latency.
-  Note 3 : Dictionary will be used for all future compression jobs.
-           To return to "no-dictionary" situation, load a NULL dictionary
-  Note 5 : Use ZSTD_CCtx_loadDictionary_advanced() to select how dictionary
-           content will be interpreted.
- 
+           It's also a CPU consuming operation, with non-negligible impact on latency.
+  Note 3 :`dict` content will be copied internally.
+           Use ZSTD_CCtx_loadDictionary_byReference() to reference dictionary content instead.
+           In such a case, dictionary buffer must outlive its users.
+  Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
+           to precisely select how dictionary content must be interpreted. 
 </p></pre><BR>

 <pre><b>size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
@ -955,29 +913,37 @@ size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size
  Special : adding a NULL CDict means "return to no-dictionary mode".
  Note 1 : Currently, only one dictionary can be managed.
           Adding a new dictionary effectively "discards" any previous one.
-  Note 2 : CDict is just referenced, its lifetime must outlive CCtx.
- 
+  Note 2 : CDict is just referenced, its lifetime must outlive CCtx. 
 </p></pre><BR>

 <pre><b>size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize);
-size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictMode_e dictMode);
+size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
 </b><p>  Reference a prefix (single-usage dictionary) for next compression job.
  Decompression need same prefix to properly regenerate data.
  Prefix is **only used once**. Tables are discarded at end of compression job.
  Subsequent compression jobs will be done without prefix (if none is explicitly referenced).
  If there is a need to use same prefix multiple times, consider embedding it into a ZSTD_CDict instead.
 @result : 0, or an error code (which can be tested with ZSTD_isError()).
-  Special : Adding any prefix (including NULL) invalidates any previous prefix or dictionary
+  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
  Note 1 : Prefix buffer is referenced. It must outlive compression job.
  Note 2 : Referencing a prefix involves building tables, which are dependent on compression parameters.
-           It's a CPU-heavy operation, with non-negligible impact on latency.
-  Note 3 : By default, the prefix is treated as raw content
-           (ZSTD_dm_rawContent). Use ZSTD_CCtx_refPrefix_advanced() to alter
-           dictMode. 
+           It's a CPU consuming operation, with non-negligible impact on latency.
+  Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
+           Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode. 
+</p></pre><BR>
+
+<pre><b>void ZSTD_CCtx_reset(ZSTD_CCtx* cctx);
+</b><p>  Return a CCtx to clean state.
+  Useful after an error, or to interrupt an ongoing compression job and start a new one.
+  Any internal data not yet flushed is cancelled.
+  Dictionary (if any) is dropped.
+  All parameters are back to default values.
+  It's possible to modify compression parameters after a reset.
+ 
 </p></pre><BR>

 <pre><b>typedef enum {
-    ZSTD_e_continue=0, </b>/* collect more data, encoder transparently decides when to output result, for optimal conditions */<b>
+    ZSTD_e_continue=0, </b>/* collect more data, encoder decides when to output compressed result, for optimal conditions */<b>
    ZSTD_e_flush,      </b>/* flush any data provided so far - frame will continue, future data can still reference previous data for better compression */<b>
    ZSTD_e_end         </b>/* flush any remaining data and close current frame. Any additional data starts a new frame. */<b>
 } ZSTD_EndDirective;
@ -996,10 +962,11 @@ size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t
                                                     and then immediately returns, just indicating that there is some data remaining to be flushed.
                                                     The function nonetheless guarantees forward progress : it will return only after it reads or write at least 1+ byte.
  - Exception : in multi-threading mode, if the first call requests a ZSTD_e_end directive, it is blocking : it will complete compression before giving back control to caller.
-  - @return provides the minimum amount of data remaining to be flushed from internal buffers
+  - @return provides a minimum amount of data remaining to be flushed from internal buffers
            or an error code, which can be tested using ZSTD_isError().
            if @return != 0, flush is not fully completed, there is still some data left within internal buffers.
-            This is useful to determine if a ZSTD_e_flush or ZSTD_e_end directive is completed.
+            This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers.
+            For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed.
  - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0),
            only ZSTD_e_end or ZSTD_e_flush operations are allowed.
            Before starting a new compression job, or changing compression parameters,
@ -1007,16 +974,6 @@ size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t
 
 </p></pre><BR>

-<pre><b>void ZSTD_CCtx_reset(ZSTD_CCtx* cctx);   </b>/* Not ready yet ! */<b>
-</b><p>  Return a CCtx to clean state.
-  Useful after an error, or to interrupt an ongoing compression job and start a new one.
-  Any internal data not yet flushed is cancelled.
-  Dictionary (if any) is dropped.
-  All parameters are back to default values.
-  It's possible to modify compression parameters after a reset.
- 
-</p></pre><BR>
-
 <pre><b>size_t ZSTD_compress_generic_simpleArgs (
                ZSTD_CCtx* cctx,
                void* dst, size_t dstCapacity, size_t* dstPos,
@ -1031,6 +988,7 @@ size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t
 </p></pre><BR>

 <pre><b>ZSTD_CCtx_params* ZSTD_createCCtxParams(void);
+size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);
 </b><p>  Quick howto :
  - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure
  - ZSTD_CCtxParam_setParameter() : Push parameters one by one into
@ -1049,18 +1007,18 @@ size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t
 
 </p></pre><BR>

-<pre><b>size_t ZSTD_resetCCtxParams(ZSTD_CCtx_params* params);
-</b><p>  Reset params to default, with the default compression level.
+<pre><b>size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params);
+</b><p>  Reset params to default values.
 
 </p></pre><BR>

-<pre><b>size_t ZSTD_initCCtxParams(ZSTD_CCtx_params* cctxParams, int compressionLevel);
+<pre><b>size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel);
 </b><p>  Initializes the compression parameters of cctxParams according to
  compression level. All other parameters are reset to their default values.
 
 </p></pre><BR>

-<pre><b>size_t ZSTD_initCCtxParams_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params);
+<pre><b>size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params);
 </b><p>  Initializes the compression and frame parameters of cctxParams according to
  params. All other parameters are reset to their default values.
 
@ -1078,16 +1036,17 @@ size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t
 <pre><b>size_t ZSTD_CCtx_setParametersUsingCCtxParams(
        ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
 </b><p>  Apply a set of ZSTD_CCtx_params to the compression context.
-  This must be done before the dictionary is loaded.
-  The pledgedSrcSize is treated as unknown.
-  Multithreading parameters are applied only if nbThreads > 1.
+  This can be done even after compression is started,
+    if nbWorkers==0, this will have no impact until a new compression is started.
+    if nbWorkers>=1, new parameters will be picked up at next job,
+       with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).
 
 </p></pre><BR>

 <h3>Advanced parameters for decompression API</h3><pre></pre><b><pre></pre></b><BR>
-<pre><b>size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);   </b>/* not implemented */<b>
-size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);   </b>/* not implemented */<b>
-size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictMode_e dictMode);   </b>/* not implemented */<b>
+<pre><b>size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
 </b><p>  Create an internal DDict from dict buffer,
  to be used to decompress next frames.
 @result : 0, or an error code (which can be tested with ZSTD_isError()).
@ -1104,7 +1063,7 @@ size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size
 
 </p></pre><BR>

-<pre><b>size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);   </b>/* not implemented */<b>
+<pre><b>size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
 </b><p>  Reference a prepared dictionary, to be used to decompress next frames.
  The dictionary remains active for decompression of future frames using same DCtx.
 @result : 0, or an error code (which can be tested with ZSTD_isError()).
@ -1115,8 +1074,8 @@ size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size
 
 </p></pre><BR>

-<pre><b>size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize);   </b>/* not implemented */<b>
-size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictMode_e dictMode);   </b>/* not implemented */<b>
+<pre><b>size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize);
+size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
 </b><p>  Reference a prefix (single-usage dictionary) for next compression job.
  Prefix is **only used once**. It must be explicitly referenced before each frame.
  If there is a need to use same prefix multiple times, consider embedding it into a ZSTD_DDict instead.
@ -1178,7 +1137,7 @@ size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t
 
 </p></pre><BR>

-<a name="Chapter22"></a><h2>Block level API</h2><pre></pre>
+<a name="Chapter21"></a><h2>Block level API</h2><pre></pre>

 <pre><b></b><p>    Frame metadata cost is typically ~18 bytes, which can be non-negligible for very small blocks (< 100 bytes).
    User will have to take in charge required information to regenerate data, such as compressed and content sizes.
@ -1206,7 +1165,7 @@ size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t
 <h3>Raw zstd block functions</h3><pre></pre><b><pre>size_t ZSTD_getBlockSize   (const ZSTD_CCtx* cctx);
 size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
-size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize);  </b>/**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression */<b>
+size_t ZSTD_insertBlock    (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize);  </b>/**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */<b>
 </pre></b><BR>
 </html>
 </body>
--- a/sys/contrib/zstd/lib/BUCK
+++ b/sys/contrib/zstd/lib/BUCK
@ -25,6 +25,9 @@ cxx_library(
    name='decompress',
    header_namespace='',
    visibility=['PUBLIC'],
+    headers=subdir_glob([
+        ('decompress', '*_impl.h'),
+    ]),
    srcs=glob(['decompress/zstd*.c']),
    deps=[
        ':common',
@ -80,6 +83,15 @@ cxx_library(
    ]),
 )

+cxx_library(
+    name='cpu',
+    header_namespace='',
+    visibility=['PUBLIC'],
+    exported_headers=subdir_glob([
+        ('common', 'cpu.h'),
+    ]),
+)
+
 cxx_library(
    name='bitstream',
    header_namespace='',
@ -196,6 +208,7 @@ cxx_library(
    deps=[
        ':bitstream',
        ':compiler',
+        ':cpu',
        ':entropy',
        ':errors',
        ':mem',
--- a/sys/contrib/zstd/lib/README.md
+++ b/sys/contrib/zstd/lib/README.md
@ -2,16 +2,19 @@ Zstandard library files
 ================================

 The __lib__ directory is split into several sub-directories,
-in order to make it easier to select or exclude specific features.
+in order to make it easier to select or exclude features.


 #### Building

-`Makefile` script is provided, supporting the standard set of commands,
-directories, and variables (see https://www.gnu.org/prep/standards/html_node/Command-Variables.html).
+`Makefile` script is provided, supporting all standard [Makefile conventions](https://www.gnu.org/prep/standards/html_node/Makefile-Conventions.html#Makefile-Conventions),
+including commands variables, staged install, directory variables and standard targets.
 - `make` : generates both static and dynamic libraries
 - `make install` : install libraries in default system directories

+`libzstd` default scope includes compression, decompression, dictionary building,
+and decoding support for legacy formats >= v0.4.0.
+

 #### API

@ -27,29 +30,37 @@ Optional advanced features are exposed via :
 - `ZSTD_STATIC_LINKING_ONLY` : if this macro is defined _before_ including `zstd.h`,
                          it unlocks access to advanced experimental API,
                          exposed in second part of `zstd.h`.
-                          These APIs shall ___never be used with dynamic library___ !
-                          They are not "stable", their definition may change in the future.
+                          These APIs are not "stable", their definition may change in the future.
+                          As a consequence, it shall ___never be used with dynamic library___ !
                          Only static linking is allowed.


 #### Modular build

+It's possible to compile only a limited set of features.
+
 - Directory `lib/common` is always required, for all variants.
 - Compression source code lies in `lib/compress`
 - Decompression source code lies in `lib/decompress`
 - It's possible to include only `compress` or only `decompress`, they don't depend on each other.
 - `lib/dictBuilder` : makes it possible to generate dictionaries from a set of samples.
-    The API is exposed in `lib/dictBuilder/zdict.h`.
-    This module depends on both `lib/common` and `lib/compress` .
- `lib/legacy` : source code to decompress older zstd formats, starting from `v0.1`.
-              This module depends on `lib/common` and `lib/decompress`.
-              To enable this feature, it's necessary to define `ZSTD_LEGACY_SUPPORT = 1` during compilation.
-              Typically, with `gcc`, add argument `-DZSTD_LEGACY_SUPPORT=1`.
-              Using higher number limits the number of version supported.
-              For example, `ZSTD_LEGACY_SUPPORT=2` means : "support legacy formats starting from v0.2+".
-              The API is exposed in `lib/legacy/zstd_legacy.h`.
-              Each version also provides a (dedicated) set of advanced API.
-              For example, advanced API for version `v0.4` is exposed in `lib/legacy/zstd_v04.h` .
+        The API is exposed in `lib/dictBuilder/zdict.h`.
+        This module depends on both `lib/common` and `lib/compress` .
+- `lib/legacy` : source code to decompress legacy zstd formats, starting from `v0.1.0`.
+        This module depends on `lib/common` and `lib/decompress`.
+        To enable this feature, it's required to define `ZSTD_LEGACY_SUPPORT` during compilation.
+        Typically, with `gcc`, add argument `-DZSTD_LEGACY_SUPPORT=1`.
+        Using higher number limits versions supported.
+        For example, `ZSTD_LEGACY_SUPPORT=2` means : "support legacy formats >= v0.2.0".
+        `ZSTD_LEGACY_SUPPORT=3` means : "support legacy formats >= v0.3.0", and so on.
+        Starting v0.8.0, all versions of `zstd` produce frames compliant with specification.
+        As a consequence, `ZSTD_LEGACY_SUPPORT=8` (or more) doesn't trigger legacy support.
+        Also, `ZSTD_LEGACY_SUPPORT=0` means "do __not__ support legacy formats".
+        Once enabled, this capability is transparently triggered within decompression functions.
+        It's also possible to invoke directly legacy API, as exposed in `lib/legacy/zstd_legacy.h`.
+        Each version also provides an additional dedicated set of advanced API.
+        For example, advanced API for version `v0.4` is exposed in `lib/legacy/zstd_v04.h` .
+        Note : `lib/legacy` only supports _decoding_ legacy formats.


 #### Multithreading support
@ -57,19 +68,16 @@ Optional advanced features are exposed via :
 Multithreading is disabled by default when building with `make`.
 Enabling multithreading requires 2 conditions :
 - set macro `ZSTD_MULTITHREAD`
- on POSIX systems : compile with pthread (`-pthread` compilation flag for `gcc` for example)
+- on POSIX systems : compile with pthread (`-pthread` compilation flag for `gcc`)

 Both conditions are automatically triggered by invoking `make lib-mt` target.
 Note that, when linking a POSIX program with a multithreaded version of `libzstd`,
 it's necessary to trigger `-pthread` flag during link stage.

-Multithreading capabilities are exposed via :
- private API `lib/compress/zstdmt_compress.h`.
-  Symbols defined in this header are currently exposed in `libzstd`, hence usable.
-  Note however that this API is planned to be locked and remain strictly internal in the future.
- advanced API `ZSTD_compress_generic()`, defined in `lib/zstd.h`, experimental section.
-  This API is still considered experimental, but is designed to be labelled "stable" at some point in the future.
-  It's the recommended entry point for multi-threading operations.
+Multithreading capabilities are exposed
+via [advanced API `ZSTD_compress_generic()` defined in `lib/zstd.h`](https://github.com/facebook/zstd/blob/dev/lib/zstd.h#L919).
+This API is still considered experimental,
+but is expected to become "stable" at some point in the future.


 #### Windows : using MinGW+MSYS to create DLL
@ -92,7 +100,6 @@ The compiled executable will require ZSTD DLL which is available at `dll\libzstd

 Obsolete API on their way out are stored in directory `lib/deprecated`.
 At this stage, it contains older streaming prototypes, in `lib/deprecated/zbuff.h`.
-Presence in this directory is temporary.
 These prototypes will be removed in some future version.
 Consider migrating code towards supported streaming API exposed in `zstd.h`.

--- a/sys/contrib/zstd/lib/common/bitstream.h
+++ b/sys/contrib/zstd/lib/common/bitstream.h
@ -426,7 +426,7 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
 *  Refill `bitD` from buffer previously set in BIT_initDStream() .
 *  This function is safe, it guarantees it will not read beyond src buffer.
 * @return : status of `BIT_DStream_t` internal register.
- *           when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
+ *           when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
 MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
 {
    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* overflow detected, like end of stream */
--- a/sys/contrib/zstd/lib/common/compiler.h
+++ b/sys/contrib/zstd/lib/common/compiler.h
@ -63,6 +63,31 @@
 #  endif
 #endif

+/* target attribute */
+#ifndef __has_attribute
+  #define __has_attribute(x) 0  /* Compatibility with non-clang compilers. */
+#endif
+#if defined(__GNUC__)
+#  define TARGET_ATTRIBUTE(target) __attribute__((__target__(target)))
+#else
+#  define TARGET_ATTRIBUTE(target)
+#endif
+
+/* Enable runtime BMI2 dispatch based on the CPU.
+ * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default.
+ */
+#ifndef DYNAMIC_BMI2
+  #if (defined(__clang__) && __has_attribute(__target__)) \
+      || (defined(__GNUC__) \
+          && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))) \
+      && (defined(__x86_64__) || defined(_M_X86)) \
+      && !defined(__BMI2__)
+  #  define DYNAMIC_BMI2 1
+  #else
+  #  define DYNAMIC_BMI2 0
+  #endif
+#endif
+
 /* prefetch */
 #if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
 #  include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
--- a/sys/contrib/zstd/lib/common/cpu.h
+++ b/sys/contrib/zstd/lib/common/cpu.h
@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2018-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMMON_CPU_H
+#define ZSTD_COMMON_CPU_H
+
+/**
+ * Implementation taken from folly/CpuId.h
+ * https://github.com/facebook/folly/blob/master/folly/CpuId.h
+ */
+
+#include <string.h>
+
+#include "mem.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+typedef struct {
+    U32 f1c;
+    U32 f1d;
+    U32 f7b;
+    U32 f7c;
+} ZSTD_cpuid_t;
+
+MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
+    U32 f1c = 0;
+    U32 f1d = 0;
+    U32 f7b = 0;
+    U32 f7c = 0;
+#ifdef _MSC_VER
+    int reg[4];
+    __cpuid((int*)reg, 0);
+    {
+        int const n = reg[0];
+        if (n >= 1) {
+            __cpuid((int*)reg, 1);
+            f1c = (U32)reg[2];
+            f1d = (U32)reg[3];
+        }
+        if (n >= 7) {
+            __cpuidex((int*)reg, 7, 0);
+            f7b = (U32)reg[1];
+            f7c = (U32)reg[2];
+        }
+    }
+#elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__)
+    /* The following block like the normal cpuid branch below, but gcc
+     * reserves ebx for use of its pic register so we must specially
+     * handle the save and restore to avoid clobbering the register
+     */
+    U32 n;
+    __asm__(
+        "pushl %%ebx\n\t"
+        "cpuid\n\t"
+        "popl %%ebx\n\t"
+        : "=a"(n)
+        : "a"(0)
+        : "ecx", "edx");
+    if (n >= 1) {
+      U32 f1a;
+      __asm__(
+          "pushl %%ebx\n\t"
+          "cpuid\n\t"
+          "popl %%ebx\n\t"
+          : "=a"(f1a), "=c"(f1c), "=d"(f1d)
+          : "a"(1)
+          :);
+    }
+    if (n >= 7) {
+      __asm__(
+          "pushl %%ebx\n\t"
+          "cpuid\n\t"
+          "movl %%ebx, %%eax\n\r"
+          "popl %%ebx"
+          : "=a"(f7b), "=c"(f7c)
+          : "a"(7), "c"(0)
+          : "edx");
+    }
+#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__)
+    U32 n;
+    __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx");
+    if (n >= 1) {
+      U32 f1a;
+      __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx");
+    }
+    if (n >= 7) {
+      U32 f7a;
+      __asm__("cpuid"
+              : "=a"(f7a), "=b"(f7b), "=c"(f7c)
+              : "a"(7), "c"(0)
+              : "edx");
+    }
+#endif
+    {
+        ZSTD_cpuid_t cpuid;
+        cpuid.f1c = f1c;
+        cpuid.f1d = f1d;
+        cpuid.f7b = f7b;
+        cpuid.f7c = f7c;
+        return cpuid;
+    }
+}
+
+#define X(name, r, bit)                                                        \
+  MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const cpuid) {                 \
+    return ((cpuid.r) & (1U << bit)) != 0;                                     \
+  }
+
+/* cpuid(1): Processor Info and Feature Bits. */
+#define C(name, bit) X(name, f1c, bit)
+  C(sse3, 0)
+  C(pclmuldq, 1)
+  C(dtes64, 2)
+  C(monitor, 3)
+  C(dscpl, 4)
+  C(vmx, 5)
+  C(smx, 6)
+  C(eist, 7)
+  C(tm2, 8)
+  C(ssse3, 9)
+  C(cnxtid, 10)
+  C(fma, 12)
+  C(cx16, 13)
+  C(xtpr, 14)
+  C(pdcm, 15)
+  C(pcid, 17)
+  C(dca, 18)
+  C(sse41, 19)
+  C(sse42, 20)
+  C(x2apic, 21)
+  C(movbe, 22)
+  C(popcnt, 23)
+  C(tscdeadline, 24)
+  C(aes, 25)
+  C(xsave, 26)
+  C(osxsave, 27)
+  C(avx, 28)
+  C(f16c, 29)
+  C(rdrand, 30)
+#undef C
+#define D(name, bit) X(name, f1d, bit)
+  D(fpu, 0)
+  D(vme, 1)
+  D(de, 2)
+  D(pse, 3)
+  D(tsc, 4)
+  D(msr, 5)
+  D(pae, 6)
+  D(mce, 7)
+  D(cx8, 8)
+  D(apic, 9)
+  D(sep, 11)
+  D(mtrr, 12)
+  D(pge, 13)
+  D(mca, 14)
+  D(cmov, 15)
+  D(pat, 16)
+  D(pse36, 17)
+  D(psn, 18)
+  D(clfsh, 19)
+  D(ds, 21)
+  D(acpi, 22)
+  D(mmx, 23)
+  D(fxsr, 24)
+  D(sse, 25)
+  D(sse2, 26)
+  D(ss, 27)
+  D(htt, 28)
+  D(tm, 29)
+  D(pbe, 31)
+#undef D
+
+/* cpuid(7): Extended Features. */
+#define B(name, bit) X(name, f7b, bit)
+  B(bmi1, 3)
+  B(hle, 4)
+  B(avx2, 5)
+  B(smep, 7)
+  B(bmi2, 8)
+  B(erms, 9)
+  B(invpcid, 10)
+  B(rtm, 11)
+  B(mpx, 14)
+  B(avx512f, 16)
+  B(avx512dq, 17)
+  B(rdseed, 18)
+  B(adx, 19)
+  B(smap, 20)
+  B(avx512ifma, 21)
+  B(pcommit, 22)
+  B(clflushopt, 23)
+  B(clwb, 24)
+  B(avx512pf, 26)
+  B(avx512er, 27)
+  B(avx512cd, 28)
+  B(sha, 29)
+  B(avx512bw, 30)
+  B(avx512vl, 31)
+#undef B
+#define C(name, bit) X(name, f7c, bit)
+  C(prefetchwt1, 0)
+  C(avx512vbmi, 1)
+#undef C
+
+#undef X
+
+#endif /* ZSTD_COMMON_CPU_H */
--- a/sys/contrib/zstd/lib/common/error_private.c
+++ b/sys/contrib/zstd/lib/common/error_private.c
@ -29,6 +29,7 @@ const char* ERR_getErrorString(ERR_enum code)
    case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
    case PREFIX(init_missing): return "Context should be init first";
    case PREFIX(memory_allocation): return "Allocation error : not enough memory";
+    case PREFIX(workSpace_tooSmall): return "workSpace buffer is not large enough";
    case PREFIX(stage_wrong): return "Operation not authorized at current processing stage";
    case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
    case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large";
--- a/sys/contrib/zstd/lib/common/fse.h
+++ b/sys/contrib/zstd/lib/common/fse.h
@ -345,7 +345,7 @@ size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr, const void* s
 */
 size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* workSpace);

-/*! FSE_count_simple
+/*! FSE_count_simple() :
 * Same as FSE_countFast(), but does not use any additional memory (not even on stack).
 * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr` (presuming it's also the size of `count`).
 */
--- a/sys/contrib/zstd/lib/common/fse_decompress.c
+++ b/sys/contrib/zstd/lib/common/fse_decompress.c
@ -139,8 +139,8 @@ size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned
    {   U32 u;
        for (u=0; u<tableSize; u++) {
            FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
-            U16 nextState = symbolNext[symbol]++;
-            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32 ((U32)nextState) );
+            U32 const nextState = symbolNext[symbol]++;
+            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
    }   }

--- a/sys/contrib/zstd/lib/common/huf.h
+++ b/sys/contrib/zstd/lib/common/huf.h
@ -58,32 +58,32 @@ extern "C" {
 #endif


-/* *** simple functions *** */
-/**
-HUF_compress() :
-    Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'.
-    'dst' buffer must be already allocated.
-    Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize).
-    `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB.
-    @return : size of compressed data (<= `dstCapacity`).
-    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
-                     if return == 1, srcData is a single repeated byte symbol (RLE compression).
-                     if HUF_isError(return), compression failed (more details using HUF_getErrorName())
-*/
+/* ========================== */
+/* ***  simple functions  *** */
+/* ========================== */
+
+/** HUF_compress() :
+ *  Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'.
+ * 'dst' buffer must be already allocated.
+ *  Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize).
+ * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB.
+ * @return : size of compressed data (<= `dstCapacity`).
+ *  Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+ *                   if HUF_isError(return), compression failed (more details using HUF_getErrorName())
+ */
 HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity,
                             const void* src, size_t srcSize);

-/**
-HUF_decompress() :
-    Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
-    into already allocated buffer 'dst', of minimum size 'dstSize'.
-    `originalSize` : **must** be the ***exact*** size of original (uncompressed) data.
-    Note : in contrast with FSE, HUF_decompress can regenerate
-           RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
-           because it knows size to regenerate.
-    @return : size of regenerated data (== originalSize),
-              or an error code, which can be tested using HUF_isError()
-*/
+/** HUF_decompress() :
+ *  Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
+ *  into already allocated buffer 'dst', of minimum size 'dstSize'.
+ * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data.
+ *  Note : in contrast with FSE, HUF_decompress can regenerate
+ *         RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+ *         because it knows size to regenerate (originalSize).
+ * @return : size of regenerated data (== originalSize),
+ *           or an error code, which can be tested using HUF_isError()
+ */
 HUF_PUBLIC_API size_t HUF_decompress(void* dst,  size_t originalSize,
                               const void* cSrc, size_t cSrcSize);

@ -100,39 +100,32 @@ HUF_PUBLIC_API const char* HUF_getErrorName(size_t code);  /**< provides error c
 /* ***   Advanced function   *** */

 /** HUF_compress2() :
- *  Same as HUF_compress(), but offers direct control over `maxSymbolValue` and `tableLog`.
- *  `tableLog` must be `<= HUF_TABLELOG_MAX` . */
-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+ *  Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`.
+ * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX .
+ * `tableLog` must be `<= HUF_TABLELOG_MAX` . */
+HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                               unsigned maxSymbolValue, unsigned tableLog);

 /** HUF_compress4X_wksp() :
 *  Same as HUF_compress2(), but uses externally allocated `workSpace`.
- *  `workspace` must have minimum alignment of 4, and be at least as large as following macro */
+ * `workspace` must have minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */
 #define HUF_WORKSPACE_SIZE (6 << 10)
 #define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32))
-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
-
-/**
- *  The minimum workspace size for the `workSpace` used in
- *  HUF_readDTableX2_wksp() and HUF_readDTableX4_wksp().
- *
- *  The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when
- *  HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15.
- *  Buffer overflow errors may potentially occur if code modifications result in
- *  a required workspace size greater than that specified in the following
- *  macro.
- */
-#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10)
-#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
+HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     unsigned maxSymbolValue, unsigned tableLog,
+                                     void* workSpace, size_t wkspSize);

 #endif   /* HUF_H_298734234 */

 /* ******************************************************************
 *  WARNING !!
 *  The following section contains advanced and experimental definitions
- *  which shall never be used in the context of dll
+ *  which shall never be used in the context of a dynamic library,
 *  because they are not guaranteed to remain stable in the future.
 *  Only consider them in association with static linking.
- *******************************************************************/
+ * *****************************************************************/
 #if defined(HUF_STATIC_LINKING_ONLY) && !defined(HUF_H_HUF_STATIC_LINKING_ONLY)
 #define HUF_H_HUF_STATIC_LINKING_ONLY

@ -141,11 +134,11 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, const


 /* *** Constants *** */
-#define HUF_TABLELOG_MAX      12       /* max configured tableLog (for static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
-#define HUF_TABLELOG_DEFAULT  11       /* tableLog by default, when not specified */
+#define HUF_TABLELOG_MAX      12      /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
+#define HUF_TABLELOG_DEFAULT  11      /* default tableLog value when none specified */
 #define HUF_SYMBOLVALUE_MAX  255

-#define HUF_TABLELOG_ABSOLUTEMAX  15   /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#define HUF_TABLELOG_ABSOLUTEMAX  15  /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
 #if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
 #  error "HUF_TABLELOG_MAX is too large !"
 #endif
@ -192,24 +185,23 @@ size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,


 /* ****************************************
-*  HUF detailed API
-******************************************/
-/*!
-HUF_compress() does the following:
-1. count symbol occurrence from source[] into table count[] using FSE_count()
-2. (optional) refine tableLog using HUF_optimalTableLog()
-3. build Huffman table from count using HUF_buildCTable()
-4. save Huffman table to memory buffer using HUF_writeCTable()
-5. encode the data stream using HUF_compress4X_usingCTable()
+ *  HUF detailed API
+ * ****************************************/

-The following API allows targeting specific sub-functions for advanced tasks.
-For example, it's possible to compress several blocks using the same 'CTable',
-or to save and regenerate 'CTable' using external methods.
-*/
-/* FSE_count() : find it within "fse.h" */
+/*! HUF_compress() does the following:
+ *  1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h")
+ *  2. (optional) refine tableLog using HUF_optimalTableLog()
+ *  3. build Huffman table from count using HUF_buildCTable()
+ *  4. save Huffman table to memory buffer using HUF_writeCTable()
+ *  5. encode the data stream using HUF_compress4X_usingCTable()
+ *
+ *  The following API allows targeting specific sub-functions for advanced tasks.
+ *  For example, it's possible to compress several blocks using the same 'CTable',
+ *  or to save and regenerate 'CTable' using external methods.
+ */
 unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
 typedef struct HUF_CElt_s HUF_CElt;   /* incomplete type */
-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);
+size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);   /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */
 size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
 size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);

@ -219,46 +211,65 @@ typedef enum {
   HUF_repeat_valid  /**< Can use the previous table and it is asumed to be valid */
 } HUF_repeat;
 /** HUF_compress4X_repeat() :
-*   Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
-*   If it uses hufTable it does not modify hufTable or repeat.
-*   If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
-*   If preferRepeat then the old table will always be used if valid. */
-size_t HUF_compress4X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat);  /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+ *  Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ *  If it uses hufTable it does not modify hufTable or repeat.
+ *  If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+ *  If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
+                       const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned tableLog,
+                       void* workSpace, size_t wkspSize,    /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);

 /** HUF_buildCTable_wksp() :
 *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
- *  `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of 1024 unsigned.
+ * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE.
 */
+#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1)
+#define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
 size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize);

 /*! HUF_readStats() :
-    Read compact Huffman tree, saved by HUF_writeCTable().
-    `huffWeight` is destination buffer.
-    @return : size read from `src` , or an error Code .
-    Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
-size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
-                     U32* nbSymbolsPtr, U32* tableLogPtr,
+ *  Read compact Huffman tree, saved by HUF_writeCTable().
+ * `huffWeight` is destination buffer.
+ * @return : size read from `src` , or an error Code .
+ *  Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize,
+                     U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
                     const void* src, size_t srcSize);

 /** HUF_readCTable() :
-*   Loading a CTable saved with HUF_writeCTable() */
+ *  Loading a CTable saved with HUF_writeCTable() */
 size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);


 /*
-HUF_decompress() does the following:
-1. select the decompression algorithm (X2, X4) based on pre-computed heuristics
-2. build Huffman table from save, using HUF_readDTableXn()
-3. decode 1 or 4 segments in parallel using HUF_decompressSXn_usingDTable
-*/
+ * HUF_decompress() does the following:
+ * 1. select the decompression algorithm (X2, X4) based on pre-computed heuristics
+ * 2. build Huffman table from save, using HUF_readDTableX?()
+ * 3. decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable()
+ */

 /** HUF_selectDecoder() :
-*   Tells which decoder is likely to decode faster,
-*   based on a set of pre-determined metrics.
-*   @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
-*   Assumption : 0 < cSrcSize < dstSize <= 128 KB */
+ *  Tells which decoder is likely to decode faster,
+ *  based on a set of pre-computed metrics.
+ * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+ *  Assumption : 0 < dstSize <= 128 KB */
 U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);

+/**
+ *  The minimum workspace size for the `workSpace` used in
+ *  HUF_readDTableX2_wksp() and HUF_readDTableX4_wksp().
+ *
+ *  The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when
+ *  HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15.
+ *  Buffer overflow errors may potentially occur if code modifications result in
+ *  a required workspace size greater than that specified in the following
+ *  macro.
+ */
+#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10)
+#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
+
 size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize);
 size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
 size_t HUF_readDTableX4 (HUF_DTable* DTable, const void* src, size_t srcSize);
@ -269,17 +280,23 @@ size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* c
 size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);


+/* ====================== */
 /* single stream variants */
+/* ====================== */

 size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
 size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);  /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
 size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
 /** HUF_compress1X_repeat() :
-*   Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
-*   If it uses hufTable it does not modify hufTable or repeat.
-*   If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
-*   If preferRepeat then the old table will always be used if valid. */
-size_t HUF_compress1X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat);  /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+ *  Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ *  If it uses hufTable it does not modify hufTable or repeat.
+ *  If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+ *  If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
+                       const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned tableLog,
+                       void* workSpace, size_t wkspSize,   /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);

 size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
 size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
@ -295,6 +312,14 @@ size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cS
 size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
 size_t HUF_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);

+/* BMI2 variants.
+ * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
+ */
+size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+
 #endif /* HUF_STATIC_LINKING_ONLY */

 #if defined (__cplusplus)
--- a/sys/contrib/zstd/lib/common/pool.c
+++ b/sys/contrib/zstd/lib/common/pool.c
@ -12,6 +12,7 @@
 /* ======   Dependencies   ======= */
 #include <stddef.h>  /* size_t */
 #include "pool.h"
+#include "zstd_internal.h"  /* ZSTD_malloc, ZSTD_free */

 /* ======   Compiler specifics   ====== */
 #if defined(_MSC_VER)
@ -193,32 +194,54 @@ static int isQueueFull(POOL_ctx const* ctx) {
    }
 }

-void POOL_add(void* ctxVoid, POOL_function function, void *opaque) {
-    POOL_ctx* const ctx = (POOL_ctx*)ctxVoid;
-    if (!ctx) { return; }

-    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
-    {   POOL_job const job = {function, opaque};
+static void POOL_add_internal(POOL_ctx* ctx, POOL_function function, void *opaque)
+{
+    POOL_job const job = {function, opaque};
+    assert(ctx != NULL);
+    if (ctx->shutdown) return;

-        /* Wait until there is space in the queue for the new job */
-        while (isQueueFull(ctx) && !ctx->shutdown) {
-          ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex);
-        }
-        /* The queue is still going => there is space */
-        if (!ctx->shutdown) {
-            ctx->queueEmpty = 0;
-            ctx->queue[ctx->queueTail] = job;
-            ctx->queueTail = (ctx->queueTail + 1) % ctx->queueSize;
-        }
-    }
-    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    ctx->queueEmpty = 0;
+    ctx->queue[ctx->queueTail] = job;
+    ctx->queueTail = (ctx->queueTail + 1) % ctx->queueSize;
    ZSTD_pthread_cond_signal(&ctx->queuePopCond);
 }

-#else  /* ZSTD_MULTITHREAD  not defined */
-/* No multi-threading support */
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque)
+{
+    assert(ctx != NULL);
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    /* Wait until there is space in the queue for the new job */
+    while (isQueueFull(ctx) && (!ctx->shutdown)) {
+        ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex);
+    }
+    POOL_add_internal(ctx, function, opaque);
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+}

-/* We don't need any data, but if it is empty malloc() might return NULL. */
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque)
+{
+    assert(ctx != NULL);
+    ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+    if (isQueueFull(ctx)) {
+        ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+        return 0;
+    }
+    POOL_add_internal(ctx, function, opaque);
+    ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+    return 1;
+}
+
+
+#else  /* ZSTD_MULTITHREAD  not defined */
+
+/* ========================== */
+/* No multi-threading support */
+/* ========================== */
+
+
+/* We don't need any data, but if it is empty, malloc() might return NULL. */
 struct POOL_ctx_s {
    int dummy;
 };
@ -240,11 +263,17 @@ void POOL_free(POOL_ctx* ctx) {
    (void)ctx;
 }

-void POOL_add(void* ctx, POOL_function function, void* opaque) {
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) {
    (void)ctx;
    function(opaque);
 }

+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) {
+    (void)ctx;
+    function(opaque);
+    return 1;
+}
+
 size_t POOL_sizeof(POOL_ctx* ctx) {
    if (ctx==NULL) return 0;  /* supports sizeof NULL */
    assert(ctx == &g_ctx);
--- a/sys/contrib/zstd/lib/common/pool.h
+++ b/sys/contrib/zstd/lib/common/pool.h
@ -17,7 +17,8 @@ extern "C" {


 #include <stddef.h>   /* size_t */
-#include "zstd_internal.h"   /* ZSTD_customMem */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_customMem */
+#include "zstd.h"

 typedef struct POOL_ctx_s POOL_ctx;

@ -27,35 +28,43 @@ typedef struct POOL_ctx_s POOL_ctx;
 *  The maximum number of queued jobs before blocking is `queueSize`.
 * @return : POOL_ctx pointer on success, else NULL.
 */
-POOL_ctx *POOL_create(size_t numThreads, size_t queueSize);
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize);

-POOL_ctx *POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem);
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem);

 /*! POOL_free() :
    Free a thread pool returned by POOL_create().
 */
-void POOL_free(POOL_ctx *ctx);
+void POOL_free(POOL_ctx* ctx);

 /*! POOL_sizeof() :
    return memory usage of pool returned by POOL_create().
 */
-size_t POOL_sizeof(POOL_ctx *ctx);
+size_t POOL_sizeof(POOL_ctx* ctx);

 /*! POOL_function :
    The function type that can be added to a thread pool.
 */
-typedef void (*POOL_function)(void *);
+typedef void (*POOL_function)(void*);
 /*! POOL_add_function :
    The function type for a generic thread pool add function.
 */
-typedef void (*POOL_add_function)(void *, POOL_function, void *);
+typedef void (*POOL_add_function)(void*, POOL_function, void*);

 /*! POOL_add() :
-    Add the job `function(opaque)` to the thread pool.
+    Add the job `function(opaque)` to the thread pool. `ctx` must be valid.
    Possibly blocks until there is room in the queue.
    Note : The function may be executed asynchronously, so `opaque` must live until the function has been completed.
 */
-void POOL_add(void *ctx, POOL_function function, void *opaque);
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque);
+
+
+/*! POOL_tryAdd() :
+    Add the job `function(opaque)` to the thread pool if a worker is available.
+    return immediately otherwise.
+   @return : 1 if successful, 0 if not.
+*/
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque);


 #if defined (__cplusplus)
--- a/sys/contrib/zstd/lib/common/threading.h
+++ b/sys/contrib/zstd/lib/common/threading.h
@ -45,15 +45,15 @@ extern "C" {

 /* mutex */
 #define ZSTD_pthread_mutex_t           CRITICAL_SECTION
-#define ZSTD_pthread_mutex_init(a, b)  (InitializeCriticalSection((a)), 0)
+#define ZSTD_pthread_mutex_init(a, b)  ((void)(b), InitializeCriticalSection((a)), 0)
 #define ZSTD_pthread_mutex_destroy(a)  DeleteCriticalSection((a))
 #define ZSTD_pthread_mutex_lock(a)     EnterCriticalSection((a))
 #define ZSTD_pthread_mutex_unlock(a)   LeaveCriticalSection((a))

 /* condition variable */
 #define ZSTD_pthread_cond_t             CONDITION_VARIABLE
-#define ZSTD_pthread_cond_init(a, b)    (InitializeConditionVariable((a)), 0)
-#define ZSTD_pthread_cond_destroy(a)    /* No delete */
+#define ZSTD_pthread_cond_init(a, b)    ((void)(b), InitializeConditionVariable((a)), 0)
+#define ZSTD_pthread_cond_destroy(a)    ((void)(a))
 #define ZSTD_pthread_cond_wait(a, b)    SleepConditionVariableCS((a), (b), INFINITE)
 #define ZSTD_pthread_cond_signal(a)     WakeConditionVariable((a))
 #define ZSTD_pthread_cond_broadcast(a)  WakeAllConditionVariable((a))
@ -100,17 +100,17 @@ int ZSTD_pthread_join(ZSTD_pthread_t thread, void** value_ptr);
 /* No multithreading support */

 typedef int ZSTD_pthread_mutex_t;
-#define ZSTD_pthread_mutex_init(a, b)   ((void)a, 0)
-#define ZSTD_pthread_mutex_destroy(a)
-#define ZSTD_pthread_mutex_lock(a)
-#define ZSTD_pthread_mutex_unlock(a)
+#define ZSTD_pthread_mutex_init(a, b)   ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_mutex_destroy(a)   ((void)(a))
+#define ZSTD_pthread_mutex_lock(a)      ((void)(a))
+#define ZSTD_pthread_mutex_unlock(a)    ((void)(a))

 typedef int ZSTD_pthread_cond_t;
-#define ZSTD_pthread_cond_init(a, b)    ((void)a, 0)
-#define ZSTD_pthread_cond_destroy(a)
-#define ZSTD_pthread_cond_wait(a, b)
-#define ZSTD_pthread_cond_signal(a)
-#define ZSTD_pthread_cond_broadcast(a)
+#define ZSTD_pthread_cond_init(a, b)    ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_cond_destroy(a)    ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b)    ((void)(a), (void)(b))
+#define ZSTD_pthread_cond_signal(a)     ((void)(a))
+#define ZSTD_pthread_cond_broadcast(a)  ((void)(a))

 /* do not use ZSTD_pthread_t */

--- a/sys/contrib/zstd/lib/common/zstd_errors.h
+++ b/sys/contrib/zstd/lib/common/zstd_errors.h
@ -35,12 +35,20 @@ extern "C" {
 #  define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
 #endif

-/*-****************************************
- *  error codes list
- *  note : this API is still considered unstable
- *         and shall not be used with a dynamic library.
- *         only static linking is allowed
- ******************************************/
+/*-*********************************************
+ *  Error codes list
+ *-*********************************************
+ *  Error codes _values_ are pinned down since v1.3.1 only.
+ *  Therefore, don't rely on values if you may link to any version < v1.3.1.
+ *
+ *  Only values < 100 are considered stable.
+ *
+ *  note 1 : this API shall be used with static linking only.
+ *           dynamic linking is not yet officially supported.
+ *  note 2 : Prefer relying on the enum than on its value whenever possible
+ *           This is the only supported way to use the error list < v1.3.1
+ *  note 3 : ZSTD_isError() is always correct, whatever the library version.
+ **********************************************/
 typedef enum {
  ZSTD_error_no_error = 0,
  ZSTD_error_GENERIC  = 1,
@ -61,9 +69,10 @@ typedef enum {
  ZSTD_error_stage_wrong       = 60,
  ZSTD_error_init_missing      = 62,
  ZSTD_error_memory_allocation = 64,
+  ZSTD_error_workSpace_tooSmall= 66,
  ZSTD_error_dstSize_tooSmall = 70,
  ZSTD_error_srcSize_wrong    = 72,
-  /* following error codes are not stable and may be removed or changed in a future version */
+  /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
  ZSTD_error_frameIndex_tooLarge = 100,
  ZSTD_error_seekableIO          = 102,
  ZSTD_error_maxCode = 120  /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
--- a/sys/contrib/zstd/lib/common/zstd_internal.h
+++ b/sys/contrib/zstd/lib/common/zstd_internal.h
@ -132,14 +132,15 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy

 #define Litbits  8
 #define MaxLit ((1<<Litbits) - 1)
-#define MaxML  52
-#define MaxLL  35
+#define MaxML   52
+#define MaxLL   35
 #define DefaultMaxOff 28
-#define MaxOff 31
+#define MaxOff  31
 #define MaxSeq MAX(MaxLL, MaxML)   /* Assumption : MaxOff < MaxLL,MaxML */
 #define MLFSELog    9
 #define LLFSELog    9
 #define OffFSELog   8
+#define MaxFSELog  MAX(MAX(MLFSELog, LLFSELog), OffFSELog)

 static const U32 LL_bits[MaxLL+1] = { 0, 0, 0, 0, 0, 0, 0, 0,
                                      0, 0, 0, 0, 0, 0, 0, 0,
@ -228,8 +229,6 @@ typedef struct {
    BYTE* ofCode;
    U32   longLengthID;   /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
    U32   longLengthPos;
-    U32   rep[ZSTD_REP_NUM];
-    U32   repToConfirm[ZSTD_REP_NUM];
 } seqStore_t;

 const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx);   /* compress & dictBuilder */
--- a/sys/contrib/zstd/lib/compress/fse_compress.c
+++ b/sys/contrib/zstd/lib/compress/fse_compress.c
@ -248,7 +248,7 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
            bitCount  -= (count<max);
            previous0  = (count==1);
            if (remaining<1) return ERROR(GENERIC);
-            while (remaining<threshold) nbBits--, threshold>>=1;
+            while (remaining<threshold) { nbBits--; threshold>>=1; }
        }
        if (bitCount>16) {
            if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
@ -292,7 +292,7 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalized
    It doesn't use any additional memory.
    But this function is unsafe : it doesn't check that all values within `src` can fit into `count`.
    For this reason, prefer using a table `count` with 256 elements.
-    @return : count of most numerous element
+    @return : count of most numerous element.
 */
 size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
                        const void* src, size_t srcSize)
@ -305,7 +305,10 @@ size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
    memset(count, 0, (maxSymbolValue+1)*sizeof(*count));
    if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; }

-    while (ip<end) count[*ip++]++;
+    while (ip<end) {
+        assert(*ip <= maxSymbolValue);
+        count[*ip++]++;
+    }

    while (!count[maxSymbolValue]) maxSymbolValue--;
    *maxSymbolValuePtr = maxSymbolValue;
@ -318,7 +321,8 @@ size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,

 /* FSE_count_parallel_wksp() :
 * Same as FSE_count_parallel(), but using an externally provided scratch buffer.
- * `workSpace` size must be a minimum of `1024 * sizeof(unsigned)`` */
+ * `workSpace` size must be a minimum of `1024 * sizeof(unsigned)`.
+ * @return : largest histogram frequency, or an error code (notably when histogram would be larger than *maxSymbolValuePtr). */
 static size_t FSE_count_parallel_wksp(
                                unsigned* count, unsigned* maxSymbolValuePtr,
                                const void* source, size_t sourceSize,
@ -333,7 +337,7 @@ static size_t FSE_count_parallel_wksp(
    U32* const Counting3 = Counting2 + 256;
    U32* const Counting4 = Counting3 + 256;

-    memset(Counting1, 0, 4*256*sizeof(unsigned));
+    memset(workSpace, 0, 4*256*sizeof(unsigned));

    /* safety checks */
    if (!sourceSize) {
@ -379,7 +383,9 @@ static size_t FSE_count_parallel_wksp(
            if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall);
    }   }

-    {   U32 s; for (s=0; s<=maxSymbolValue; s++) {
+    {   U32 s;
+        if (maxSymbolValue > 255) maxSymbolValue = 255;
+        for (s=0; s<=maxSymbolValue; s++) {
            count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
            if (count[s] > max) max = count[s];
    }   }
@ -393,9 +399,11 @@ static size_t FSE_count_parallel_wksp(
 * Same as FSE_countFast(), but using an externally provided scratch buffer.
 * `workSpace` size must be table of >= `1024` unsigned */
 size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
-                     const void* source, size_t sourceSize, unsigned* workSpace)
+                          const void* source, size_t sourceSize,
+                          unsigned* workSpace)
 {
-    if (sourceSize < 1500) return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize);
+    if (sourceSize < 1500) /* heuristic threshold */
+        return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize);
    return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace);
 }

@ -540,7 +548,7 @@ static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count,
           find max, then give all remaining points to max */
        U32 maxV = 0, maxC = 0;
        for (s=0; s<=maxSymbolValue; s++)
-            if (count[s] > maxC) maxV=s, maxC=count[s];
+            if (count[s] > maxC) { maxV=s; maxC=count[s]; }
        norm[maxV] += (short)ToDistribute;
        return 0;
    }
@ -548,7 +556,7 @@ static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count,
    if (total == 0) {
        /* all of the symbols were low enough for the lowOne or lowThreshold */
        for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1))
-            if (norm[s] > 0) ToDistribute--, norm[s]++;
+            if (norm[s] > 0) { ToDistribute--; norm[s]++; }
        return 0;
    }

@ -604,7 +612,7 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
                    U64 restToBeat = vStep * rtbTable[proba];
                    proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;
                }
-                if (proba > largestP) largestP=proba, largest=s;
+                if (proba > largestP) { largestP=proba; largest=s; }
                normalizedCounter[s] = proba;
                stillToDistribute -= proba;
        }   }
--- a/sys/contrib/zstd/lib/compress/huf_compress.c
+++ b/sys/contrib/zstd/lib/compress/huf_compress.c
@ -46,6 +46,7 @@
 #include <string.h>     /* memcpy, memset */
 #include <stdio.h>      /* printf (debug) */
 #include "bitstream.h"
+#include "compiler.h"
 #define FSE_STATIC_LINKING_ONLY   /* FSE_optimalTableLog_internal */
 #include "fse.h"        /* header compression */
 #define HUF_STATIC_LINKING_ONLY
@ -322,7 +323,10 @@ static void HUF_sort(nodeElt* huffNode, const U32* count, U32 maxSymbolValue)
        U32 const c = count[n];
        U32 const r = BIT_highbit32(c+1) + 1;
        U32 pos = rank[r].current++;
-        while ((pos > rank[r].base) && (c > huffNode[pos-1].count)) huffNode[pos]=huffNode[pos-1], pos--;
+        while ((pos > rank[r].base) && (c > huffNode[pos-1].count)) {
+            huffNode[pos] = huffNode[pos-1];
+            pos--;
+        }
        huffNode[pos].count = c;
        huffNode[pos].byte  = (BYTE)n;
    }
@ -331,10 +335,10 @@ static void HUF_sort(nodeElt* huffNode, const U32* count, U32 maxSymbolValue)

 /** HUF_buildCTable_wksp() :
 *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
- *  `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of 1024 unsigned.
+ *  `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of HUF_CTABLE_WORKSPACE_SIZE_U32 unsigned.
 */
 #define STARTNODE (HUF_SYMBOLVALUE_MAX+1)
-typedef nodeElt huffNodeTable[2*HUF_SYMBOLVALUE_MAX+1 +1];
+typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
 size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
 {
    nodeElt* const huffNode0 = (nodeElt*)workSpace;
@ -345,9 +349,10 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValu
    U32 nodeRoot;

    /* safety checks */
-    if (wkspSize < sizeof(huffNodeTable)) return ERROR(GENERIC);   /* workSpace is not large enough */
+    if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
+    if (wkspSize < sizeof(huffNodeTable)) return ERROR(workSpace_tooSmall);
    if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
-    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(GENERIC);
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
    memset(huffNode0, 0, sizeof(huffNodeTable));

    /* sort, decreasing order */
@ -405,6 +410,7 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValu
 }

 /** HUF_buildCTable() :
+ * @return : maxNbBits
 *  Note : count is used before tree is written, so they can safely overlap
 */
 size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits)
@ -432,13 +438,14 @@ static int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, uns
  return !bad;
 }

-static void HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
+size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
+
+FORCE_INLINE_TEMPLATE void
+HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
 {
    BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
 }

-size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
-
 #define HUF_FLUSHBITS(s)  BIT_flushBits(s)

 #define HUF_FLUSHBITS_1(stream) \
@ -447,7 +454,10 @@ size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
 #define HUF_FLUSHBITS_2(stream) \
    if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream)

-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+FORCE_INLINE_TEMPLATE size_t
+HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
+                                   const void* src, size_t srcSize,
+                                   const HUF_CElt* CTable)
 {
    const BYTE* ip = (const BYTE*) src;
    BYTE* const ostart = (BYTE*)dst;
@ -491,8 +501,58 @@ size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, si
    return BIT_closeCStream(&bitC);
 }

+#if DYNAMIC_BMI2

-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+static TARGET_ATTRIBUTE("bmi2") size_t
+HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize,
+                                   const void* src, size_t srcSize,
+                                   const HUF_CElt* CTable)
+{
+    return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+static size_t
+HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
+                                      const void* src, size_t srcSize,
+                                      const HUF_CElt* CTable)
+{
+    return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+static size_t
+HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+                              const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, const int bmi2)
+{
+    if (bmi2) {
+        return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable);
+    }
+    return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
+}
+
+#else
+
+static size_t
+HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+                              const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, const int bmi2)
+{
+    (void)bmi2;
+    return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+#endif
+
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+{
+    return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+}
+
+
+static size_t
+HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+                              const void* src, size_t srcSize,
+                              const HUF_CElt* CTable, int bmi2)
 {
    size_t const segmentSize = (srcSize+3)/4;   /* first 3 segments */
    const BYTE* ip = (const BYTE*) src;
@ -505,28 +565,31 @@ size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, si
    if (srcSize < 12) return 0;   /* no saving possible : too small input */
    op += 6;   /* jumpTable */

-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) );
        if (cSize==0) return 0;
+        assert(cSize <= 65535);
        MEM_writeLE16(ostart, (U16)cSize);
        op += cSize;
    }

    ip += segmentSize;
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) );
        if (cSize==0) return 0;
+        assert(cSize <= 65535);
        MEM_writeLE16(ostart+2, (U16)cSize);
        op += cSize;
    }

    ip += segmentSize;
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) );
        if (cSize==0) return 0;
+        assert(cSize <= 65535);
        MEM_writeLE16(ostart+4, (U16)cSize);
        op += cSize;
    }

    ip += segmentSize;
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, iend-ip, CTable) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, iend-ip, CTable, bmi2) );
        if (cSize==0) return 0;
        op += cSize;
    }
@ -534,15 +597,20 @@ size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, si
    return op-ostart;
 }

+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+{
+    return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+}
+

 static size_t HUF_compressCTable_internal(
                BYTE* const ostart, BYTE* op, BYTE* const oend,
                const void* src, size_t srcSize,
-                unsigned singleStream, const HUF_CElt* CTable)
+                unsigned singleStream, const HUF_CElt* CTable, const int bmi2)
 {
    size_t const cSize = singleStream ?
-                         HUF_compress1X_usingCTable(op, oend - op, src, srcSize, CTable) :
-                         HUF_compress4X_usingCTable(op, oend - op, src, srcSize, CTable);
+                         HUF_compress1X_usingCTable_internal(op, oend - op, src, srcSize, CTable, bmi2) :
+                         HUF_compress4X_usingCTable_internal(op, oend - op, src, srcSize, CTable, bmi2);
    if (HUF_isError(cSize)) { return cSize; }
    if (cSize==0) { return 0; }   /* uncompressible */
    op += cSize;
@ -551,86 +619,98 @@ static size_t HUF_compressCTable_internal(
    return op-ostart;
 }

+typedef struct {
+    U32 count[HUF_SYMBOLVALUE_MAX + 1];
+    HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1];
+    huffNodeTable nodeTable;
+} HUF_compress_tables_t;

-/* `workSpace` must a table of at least 1024 unsigned */
+/* HUF_compress_internal() :
+ * `workSpace` must a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
 static size_t HUF_compress_internal (
                void* dst, size_t dstSize,
                const void* src, size_t srcSize,
                unsigned maxSymbolValue, unsigned huffLog,
                unsigned singleStream,
                void* workSpace, size_t wkspSize,
-                HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat)
+                HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
+                const int bmi2)
 {
+    HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace;
    BYTE* const ostart = (BYTE*)dst;
    BYTE* const oend = ostart + dstSize;
    BYTE* op = ostart;

-    U32* count;
-    size_t const countSize = sizeof(U32) * (HUF_SYMBOLVALUE_MAX + 1);
-    HUF_CElt* CTable;
-    size_t const CTableSize = sizeof(HUF_CElt) * (HUF_SYMBOLVALUE_MAX + 1);
-
    /* checks & inits */
-    if (wkspSize < sizeof(huffNodeTable) + countSize + CTableSize) return ERROR(GENERIC);
-    if (!srcSize) return 0;  /* Uncompressed (note : 1 means rle, so first byte must be correct) */
-    if (!dstSize) return 0;  /* cannot fit within dst budget */
+    if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
+    if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall);
+    if (!srcSize) return 0;  /* Uncompressed */
+    if (!dstSize) return 0;  /* cannot fit anything within dst budget */
    if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);   /* current block size limit */
    if (huffLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
    if (!maxSymbolValue) maxSymbolValue = HUF_SYMBOLVALUE_MAX;
    if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;

-    count = (U32*)workSpace;
-    workSpace = (BYTE*)workSpace + countSize;
-    wkspSize -= countSize;
-    CTable = (HUF_CElt*)workSpace;
-    workSpace = (BYTE*)workSpace + CTableSize;
-    wkspSize -= CTableSize;
-
-    /* Heuristic : If we don't need to check the validity of the old table use the old table for small inputs */
+    /* Heuristic : If old table is valid, use it for small inputs */
    if (preferRepeat && repeat && *repeat == HUF_repeat_valid) {
-        return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable);
+        return HUF_compressCTable_internal(ostart, op, oend,
+                                           src, srcSize,
+                                           singleStream, oldHufTable, bmi2);
    }

    /* Scan input and build symbol stats */
-    {   CHECK_V_F(largest, FSE_count_wksp (count, &maxSymbolValue, (const BYTE*)src, srcSize, (U32*)workSpace) );
+    {   CHECK_V_F(largest, FSE_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->count) );
        if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; }   /* single symbol, rle */
-        if (largest <= (srcSize >> 7)+1) return 0;   /* Fast heuristic : not compressible enough */
+        if (largest <= (srcSize >> 7)+1) return 0;   /* heuristic : probably not compressible enough */
    }

    /* Check validity of previous table */
-    if (repeat && *repeat == HUF_repeat_check && !HUF_validateCTable(oldHufTable, count, maxSymbolValue)) {
+    if ( repeat
+      && *repeat == HUF_repeat_check
+      && !HUF_validateCTable(oldHufTable, table->count, maxSymbolValue)) {
        *repeat = HUF_repeat_none;
    }
    /* Heuristic : use existing table for small inputs */
    if (preferRepeat && repeat && *repeat != HUF_repeat_none) {
-        return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable);
+        return HUF_compressCTable_internal(ostart, op, oend,
+                                           src, srcSize,
+                                           singleStream, oldHufTable, bmi2);
    }

    /* Build Huffman Tree */
    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
-    {   CHECK_V_F(maxBits, HUF_buildCTable_wksp (CTable, count, maxSymbolValue, huffLog, workSpace, wkspSize) );
+    {   CHECK_V_F(maxBits, HUF_buildCTable_wksp(table->CTable, table->count,
+                                                maxSymbolValue, huffLog,
+                                                table->nodeTable, sizeof(table->nodeTable)) );
        huffLog = (U32)maxBits;
-        /* Zero the unused symbols so we can check it for validity */
-        memset(CTable + maxSymbolValue + 1, 0, CTableSize - (maxSymbolValue + 1) * sizeof(HUF_CElt));
+        /* Zero unused symbols in CTable, so we can check it for validity */
+        memset(table->CTable + (maxSymbolValue + 1), 0,
+               sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt)));
    }

    /* Write table description header */
-    {   CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, CTable, maxSymbolValue, huffLog) );
-        /* Check if using the previous table will be beneficial */
+    {   CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, table->CTable, maxSymbolValue, huffLog) );
+        /* Check if using previous huffman table is beneficial */
        if (repeat && *repeat != HUF_repeat_none) {
-            size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, count, maxSymbolValue);
-            size_t const newSize = HUF_estimateCompressedSize(CTable, count, maxSymbolValue);
+            size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue);
+            size_t const newSize = HUF_estimateCompressedSize(table->CTable, table->count, maxSymbolValue);
            if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
-                return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable);
-            }
-        }
-        /* Use the new table */
+                return HUF_compressCTable_internal(ostart, op, oend,
+                                                   src, srcSize,
+                                                   singleStream, oldHufTable, bmi2);
+        }   }
+
+        /* Use the new huffman table */
        if (hSize + 12ul >= srcSize) { return 0; }
        op += hSize;
        if (repeat) { *repeat = HUF_repeat_none; }
-        if (oldHufTable) { memcpy(oldHufTable, CTable, CTableSize); } /* Save the new table */
+        if (oldHufTable)
+            memcpy(oldHufTable, table->CTable, sizeof(table->CTable));  /* Save new table */
    }
-    return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, CTable);
+    return HUF_compressCTable_internal(ostart, op, oend,
+                                       src, srcSize,
+                                       singleStream, table->CTable, bmi2);
 }


@ -639,52 +719,70 @@ size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
                      unsigned maxSymbolValue, unsigned huffLog,
                      void* workSpace, size_t wkspSize)
 {
-    return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, NULL, NULL, 0);
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, 1 /*single stream*/,
+                                 workSpace, wkspSize,
+                                 NULL, NULL, 0, 0 /*bmi2*/);
 }

 size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
                      const void* src, size_t srcSize,
                      unsigned maxSymbolValue, unsigned huffLog,
                      void* workSpace, size_t wkspSize,
-                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat)
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
 {
-    return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, hufTable, repeat, preferRepeat);
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, 1 /*single stream*/,
+                                 workSpace, wkspSize, hufTable,
+                                 repeat, preferRepeat, bmi2);
 }

 size_t HUF_compress1X (void* dst, size_t dstSize,
                 const void* src, size_t srcSize,
                 unsigned maxSymbolValue, unsigned huffLog)
 {
-    unsigned workSpace[1024];
+    unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
    return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
 }

+/* HUF_compress4X_repeat():
+ * compress input using 4 streams.
+ * provide workspace to generate compression tables */
 size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
                      const void* src, size_t srcSize,
                      unsigned maxSymbolValue, unsigned huffLog,
                      void* workSpace, size_t wkspSize)
 {
-    return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, NULL, NULL, 0);
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, 0 /*4 streams*/,
+                                 workSpace, wkspSize,
+                                 NULL, NULL, 0, 0 /*bmi2*/);
 }

+/* HUF_compress4X_repeat():
+ * compress input using 4 streams.
+ * re-use an existing huffman compression table */
 size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
                      const void* src, size_t srcSize,
                      unsigned maxSymbolValue, unsigned huffLog,
                      void* workSpace, size_t wkspSize,
-                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat)
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
 {
-    return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, hufTable, repeat, preferRepeat);
+    return HUF_compress_internal(dst, dstSize, src, srcSize,
+                                 maxSymbolValue, huffLog, 0 /* 4 streams */,
+                                 workSpace, wkspSize,
+                                 hufTable, repeat, preferRepeat, bmi2);
 }

 size_t HUF_compress2 (void* dst, size_t dstSize,
                const void* src, size_t srcSize,
                unsigned maxSymbolValue, unsigned huffLog)
 {
-    unsigned workSpace[1024];
+    unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
    return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
 }

 size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
-    return HUF_compress2(dst, maxDstSize, src, (U32)srcSize, 255, HUF_TABLELOG_DEFAULT);
+    return HUF_compress2(dst, maxDstSize, src, srcSize, 255, HUF_TABLELOG_DEFAULT);
 }
--- a/sys/contrib/zstd/lib/compress/zstd_compress.c
+++ b/sys/contrib/zstd/lib/compress/zstd_compress.c
--- a/sys/contrib/zstd/lib/compress/zstd_compress_internal.h
+++ b/sys/contrib/zstd/lib/compress/zstd_compress_internal.h
@ -30,8 +30,14 @@ extern "C" {
 /*-*************************************
 *  Constants
 ***************************************/
-static const U32 g_searchStrength = 8;
-#define HASH_READ_SIZE 8
+#define kSearchStrength      8
+#define HASH_READ_SIZE       8
+#define ZSTD_DUBT_UNSORTED_MARK 1   /* For btlazy2 strategy, index 1 now means "unsorted".
+                                       It could be confused for a real successor at index "1", if sorted as larger than its predecessor.
+                                       It's not a big deal though : candidate will just be sorted again.
+                                       Additionnally, candidate position 1 will be lost.
+                                       But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
+                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be misdhandled after table re-use with a different strategy */


 /*-*************************************
@ -43,7 +49,7 @@ typedef enum { zcss_init=0, zcss_load, zcss_flush } ZSTD_cStreamStage;
 typedef struct ZSTD_prefixDict_s {
    const void* dict;
    size_t dictSize;
-    ZSTD_dictMode_e dictMode;
+    ZSTD_dictContentType_e dictContentType;
 } ZSTD_prefixDict;

 typedef struct {
@ -51,7 +57,6 @@ typedef struct {
    FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
    FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
    FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
-    U32 workspace[HUF_WORKSPACE_SIZE_U32];
    HUF_repeat hufCTable_repeatMode;
    FSE_repeat offcode_repeatMode;
    FSE_repeat matchlength_repeatMode;
@ -93,12 +98,44 @@ typedef struct {
    U32  staticPrices;           /* prices follow a pre-defined cost structure, statistics are irrelevant */
 } optState_t;

+typedef struct {
+  ZSTD_entropyCTables_t entropy;
+  U32 rep[ZSTD_REP_NUM];
+} ZSTD_compressedBlockState_t;
+
+typedef struct {
+    BYTE const* nextSrc;    /* next block here to continue on current prefix */
+    BYTE const* base;       /* All regular indexes relative to this position */
+    BYTE const* dictBase;   /* extDict indexes relative to this position */
+    U32 dictLimit;          /* below that point, need extDict */
+    U32 lowLimit;           /* below that point, no more data */
+} ZSTD_window_t;
+
+typedef struct {
+    ZSTD_window_t window;      /* State for window round buffer management */
+    U32 loadedDictEnd;         /* index of end of dictionary */
+    U32 nextToUpdate;          /* index from which to continue table update */
+    U32 nextToUpdate3;         /* index from which to continue table update */
+    U32 hashLog3;              /* dispatch table : larger == faster, more memory */
+    U32* hashTable;
+    U32* hashTable3;
+    U32* chainTable;
+    optState_t opt;         /* optimal parser state */
+} ZSTD_matchState_t;
+
+typedef struct {
+    ZSTD_compressedBlockState_t* prevCBlock;
+    ZSTD_compressedBlockState_t* nextCBlock;
+    ZSTD_matchState_t matchState;
+} ZSTD_blockState_t;
+
 typedef struct {
    U32 offset;
    U32 checksum;
 } ldmEntry_t;

 typedef struct {
+    ZSTD_window_t window;   /* State for the window round buffer management */
    ldmEntry_t* hashTable;
    BYTE* bucketOffsets;    /* Next position in bucket to insert entry */
    U64 hashPower;          /* Used to compute the rolling hash.
@ -111,60 +148,68 @@ typedef struct {
    U32 bucketSizeLog;      /* Log bucket size for collision resolution, at most 8 */
    U32 minMatchLength;     /* Minimum match length */
    U32 hashEveryLog;       /* Log number of entries to skip */
+    U32 windowLog;          /* Window log for the LDM */
 } ldmParams_t;

+typedef struct {
+    U32 offset;
+    U32 litLength;
+    U32 matchLength;
+} rawSeq;
+
+typedef struct {
+  rawSeq* seq;     /* The start of the sequences */
+  size_t pos;      /* The position where reading stopped. <= size. */
+  size_t size;     /* The number of sequences. <= capacity. */
+  size_t capacity; /* The capacity of the `seq` pointer */
+} rawSeqStore_t;
+
 struct ZSTD_CCtx_params_s {
    ZSTD_format_e format;
    ZSTD_compressionParameters cParams;
    ZSTD_frameParameters fParams;

    int compressionLevel;
-    U32 forceWindow;           /* force back-references to respect limit of
+    int disableLiteralCompression;
+    int forceWindow;           /* force back-references to respect limit of
                                * 1<<wLog, even for dictionary */

    /* Multithreading: used to pass parameters to mtctx */
-    U32 nbThreads;
+    unsigned nbWorkers;
    unsigned jobSize;
    unsigned overlapSizeLog;

    /* Long distance matching parameters */
    ldmParams_t ldmParams;

-    /* For use with createCCtxParams() and freeCCtxParams() only */
+    /* Internal use, for createCCtxParams() and freeCCtxParams() only */
    ZSTD_customMem customMem;
-
 };  /* typedef'd to ZSTD_CCtx_params within "zstd.h" */

 struct ZSTD_CCtx_s {
-    const BYTE* nextSrc;    /* next block here to continue on current prefix */
-    const BYTE* base;       /* All regular indexes relative to this position */
-    const BYTE* dictBase;   /* extDict indexes relative to this position */
-    U32   dictLimit;        /* below that point, need extDict */
-    U32   lowLimit;         /* below that point, no more data */
-    U32   nextToUpdate;     /* index from which to continue dictionary update */
-    U32   nextToUpdate3;    /* index from which to continue dictionary update */
-    U32   hashLog3;         /* dispatch table : larger == faster, more memory */
-    U32   loadedDictEnd;    /* index of end of dictionary */
    ZSTD_compressionStage_e stage;
-    U32   dictID;
+    int cParamsChanged;                  /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
+    int bmi2;                            /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
    ZSTD_CCtx_params requestedParams;
    ZSTD_CCtx_params appliedParams;
+    U32   dictID;
    void* workSpace;
    size_t workSpaceSize;
    size_t blockSize;
-    U64 pledgedSrcSizePlusOne;  /* this way, 0 (default) == unknown */
-    U64 consumedSrcSize;
+    unsigned long long pledgedSrcSizePlusOne;  /* this way, 0 (default) == unknown */
+    unsigned long long consumedSrcSize;
+    unsigned long long producedCSize;
    XXH64_state_t xxhState;
    ZSTD_customMem customMem;
    size_t staticSize;

-    seqStore_t seqStore;    /* sequences storage ptrs */
-    optState_t optState;
-    ldmState_t ldmState;    /* long distance matching state */
-    U32* hashTable;
-    U32* hashTable3;
-    U32* chainTable;
-    ZSTD_entropyCTables_t* entropy;
+    seqStore_t seqStore;      /* sequences storage ptrs */
+    ldmState_t ldmState;      /* long distance matching state */
+    rawSeq* ldmSequences;     /* Storage for the ldm output sequences */
+    size_t maxNbLdmSequences;
+    rawSeqStore_t externSeqStore; /* Mutable reference to external sequences */
+    ZSTD_blockState_t blockState;
+    U32* entropyWorkspace;  /* entropy workspace of HUF_WORKSPACE_SIZE bytes */

    /* streaming */
    char*  inBuff;
@ -191,6 +236,12 @@ struct ZSTD_CCtx_s {
 };


+typedef size_t (*ZSTD_blockCompressor) (
+        ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int extDict);
+
+
 MEM_STATIC U32 ZSTD_LLcode(U32 litLength)
 {
    static const BYTE LL_Code[64] = {  0,  1,  2,  3,  4,  5,  6,  7,
@ -359,10 +410,12 @@ MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* co
 }

 /** ZSTD_count_2segments() :
-*   can count match length with `ip` & `match` in 2 different segments.
-*   convention : on reaching mEnd, match count continue starting from iStart
-*/
-MEM_STATIC size_t ZSTD_count_2segments(const BYTE* ip, const BYTE* match, const BYTE* iEnd, const BYTE* mEnd, const BYTE* iStart)
+ *  can count match length with `ip` & `match` in 2 different segments.
+ *  convention : on reaching mEnd, match count continue starting from iStart
+ */
+MEM_STATIC size_t
+ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
+                     const BYTE* iEnd, const BYTE* mEnd, const BYTE* iStart)
 {
    const BYTE* const vEnd = MIN( ip + (mEnd - match), iEnd);
    size_t const matchLength = ZSTD_count(ip, match, vEnd);
@ -372,8 +425,8 @@ MEM_STATIC size_t ZSTD_count_2segments(const BYTE* ip, const BYTE* match, const


 /*-*************************************
-*  Hashes
-***************************************/
+ *  Hashes
+ ***************************************/
 static const U32 prime3bytes = 506832829U;
 static U32    ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes)  >> (32-h) ; }
 MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
@ -411,6 +464,171 @@ MEM_STATIC size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
    }
 }

+/*-*************************************
+*  Round buffer management
+***************************************/
+/* Max current allowed */
+#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX))
+/* Maximum chunk size before overflow correction needs to be called again */
+#define ZSTD_CHUNKSIZE_MAX                                                     \
+    ( ((U32)-1)                  /* Maximum ending current index */            \
+    - ZSTD_CURRENT_MAX)          /* Maximum beginning lowLimit */
+
+/**
+ * ZSTD_window_clear():
+ * Clears the window containing the history by simply setting it to empty.
+ */
+MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window)
+{
+    size_t const endT = (size_t)(window->nextSrc - window->base);
+    U32 const end = (U32)endT;
+
+    window->lowLimit = end;
+    window->dictLimit = end;
+}
+
+/**
+ * ZSTD_window_hasExtDict():
+ * Returns non-zero if the window has a non-empty extDict.
+ */
+MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window)
+{
+    return window.lowLimit < window.dictLimit;
+}
+
+/**
+ * ZSTD_window_needOverflowCorrection():
+ * Returns non-zero if the indices are getting too large and need overflow
+ * protection.
+ */
+MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window,
+                                                  void const* srcEnd)
+{
+    U32 const current = (U32)((BYTE const*)srcEnd - window.base);
+    return current > ZSTD_CURRENT_MAX;
+}
+
+/**
+ * ZSTD_window_correctOverflow():
+ * Reduces the indices to protect from index overflow.
+ * Returns the correction made to the indices, which must be applied to every
+ * stored index.
+ *
+ * The least significant cycleLog bits of the indices must remain the same,
+ * which may be 0. Every index up to maxDist in the past must be valid.
+ * NOTE: (maxDist & cycleMask) must be zero.
+ */
+MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
+                                           U32 maxDist, void const* src)
+{
+    /* preemptive overflow correction:
+     * 1. correction is large enough:
+     *    lowLimit > (3<<29) ==> current > 3<<29 + 1<<windowLog
+     *    1<<windowLog <= newCurrent < 1<<chainLog + 1<<windowLog
+     *
+     *    current - newCurrent
+     *    > (3<<29 + 1<<windowLog) - (1<<windowLog + 1<<chainLog)
+     *    > (3<<29) - (1<<chainLog)
+     *    > (3<<29) - (1<<30)             (NOTE: chainLog <= 30)
+     *    > 1<<29
+     *
+     * 2. (ip+ZSTD_CHUNKSIZE_MAX - cctx->base) doesn't overflow:
+     *    After correction, current is less than (1<<chainLog + 1<<windowLog).
+     *    In 64-bit mode we are safe, because we have 64-bit ptrdiff_t.
+     *    In 32-bit mode we are safe, because (chainLog <= 29), so
+     *    ip+ZSTD_CHUNKSIZE_MAX - cctx->base < 1<<32.
+     * 3. (cctx->lowLimit + 1<<windowLog) < 1<<32:
+     *    windowLog <= 31 ==> 3<<29 + 1<<windowLog < 7<<29 < 1<<32.
+     */
+    U32 const cycleMask = (1U << cycleLog) - 1;
+    U32 const current = (U32)((BYTE const*)src - window->base);
+    U32 const newCurrent = (current & cycleMask) + maxDist;
+    U32 const correction = current - newCurrent;
+    assert((maxDist & cycleMask) == 0);
+    assert(current > newCurrent);
+    /* Loose bound, should be around 1<<29 (see above) */
+    assert(correction > 1<<28);
+
+    window->base += correction;
+    window->dictBase += correction;
+    window->lowLimit -= correction;
+    window->dictLimit -= correction;
+
+    DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction,
+             window->lowLimit);
+    return correction;
+}
+
+/**
+ * ZSTD_window_enforceMaxDist():
+ * Updates lowLimit so that:
+ *    (srcEnd - base) - lowLimit == maxDist + loadedDictEnd
+ * This allows a simple check that index >= lowLimit to see if index is valid.
+ * This must be called before a block compression call, with srcEnd as the block
+ * source end.
+ * If loadedDictEndPtr is not NULL, we set it to zero once we update lowLimit.
+ * This is because dictionaries are allowed to be referenced as long as the last
+ * byte of the dictionary is in the window, but once they are out of range,
+ * they cannot be referenced. If loadedDictEndPtr is NULL, we use
+ * loadedDictEnd == 0.
+ */
+MEM_STATIC void ZSTD_window_enforceMaxDist(ZSTD_window_t* window,
+                                           void const* srcEnd, U32 maxDist,
+                                           U32* loadedDictEndPtr)
+{
+    U32 const current = (U32)((BYTE const*)srcEnd - window->base);
+    U32 loadedDictEnd = loadedDictEndPtr != NULL ? *loadedDictEndPtr : 0;
+    if (current > maxDist + loadedDictEnd) {
+        U32 const newLowLimit = current - maxDist;
+        if (window->lowLimit < newLowLimit) window->lowLimit = newLowLimit;
+        if (window->dictLimit < window->lowLimit) {
+            DEBUGLOG(5, "Update dictLimit from %u to %u", window->dictLimit,
+                     window->lowLimit);
+            window->dictLimit = window->lowLimit;
+        }
+        if (loadedDictEndPtr)
+            *loadedDictEndPtr = 0;
+    }
+}
+
+/**
+ * ZSTD_window_update():
+ * Updates the window by appending [src, src + srcSize) to the window.
+ * If it is not contiguous, the current prefix becomes the extDict, and we
+ * forget about the extDict. Handles overlap of the prefix and extDict.
+ * Returns non-zero if the segment is contiguous.
+ */
+MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
+                                  void const* src, size_t srcSize)
+{
+    BYTE const* const ip = (BYTE const*)src;
+    U32 contiguous = 1;
+    /* Check if blocks follow each other */
+    if (src != window->nextSrc) {
+        /* not contiguous */
+        size_t const distanceFromBase = (size_t)(window->nextSrc - window->base);
+        DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u",
+                 window->dictLimit);
+        window->lowLimit = window->dictLimit;
+        assert(distanceFromBase == (size_t)(U32)distanceFromBase);  /* should never overflow */
+        window->dictLimit = (U32)distanceFromBase;
+        window->dictBase = window->base;
+        window->base = ip - distanceFromBase;
+        // ms->nextToUpdate = window->dictLimit;
+        if (window->dictLimit - window->lowLimit < HASH_READ_SIZE) window->lowLimit = window->dictLimit;   /* too small extDict */
+        contiguous = 0;
+    }
+    window->nextSrc = ip + srcSize;
+    /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */
+    if ( (ip+srcSize > window->dictBase + window->lowLimit)
+       & (ip < window->dictBase + window->dictLimit)) {
+        ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase;
+        U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx;
+        window->lowLimit = lowLimitMax;
+    }
+    return contiguous;
+}
+
 #if defined (__cplusplus)
 }
 #endif
@ -421,6 +639,13 @@ MEM_STATIC size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
 * These prototypes shall only be called from within lib/compress
 * ============================================================== */

+/* ZSTD_getCParamsFromCCtxParams() :
+ * cParams are built depending on compressionLevel, src size hints, 
+ * LDM and manually set compression parameters.
+ */
+ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+        const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize);
+
 /*! ZSTD_initCStream_internal() :
 *  Private use only. Init streaming operation.
 *  expects params to be valid.
@ -446,7 +671,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict);
 * Private use only. To be called from zstdmt_compress.c. */
 size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
                                    const void* dict, size_t dictSize,
-                                    ZSTD_dictMode_e dictMode,
+                                    ZSTD_dictContentType_e dictContentType,
                                    const ZSTD_CDict* cdict,
                                    ZSTD_CCtx_params params,
                                    unsigned long long pledgedSrcSize);
@ -459,4 +684,26 @@ size_t ZSTD_compress_advanced_internal(ZSTD_CCtx* cctx,
                                 const void* dict,size_t dictSize,
                                 ZSTD_CCtx_params params);

+
+/* ZSTD_writeLastEmptyBlock() :
+ * output an empty Block with end-of-frame mark to complete a frame
+ * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
+ *           or an error code if `dstCapcity` is too small (<ZSTD_blockHeaderSize)
+ */
+size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity);
+
+
+/* ZSTD_referenceExternalSequences() :
+ * Must be called before starting a compression operation.
+ * seqs must parse a prefix of the source.
+ * This cannot be used when long range matching is enabled.
+ * Zstd will use these sequences, and pass the literals to a secondary block
+ * compressor.
+ * @return : An error code on failure.
+ * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory
+ * access and data corruption.
+ */
+size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
+
+
 #endif /* ZSTD_COMPRESS_H */
--- a/sys/contrib/zstd/lib/compress/zstd_double_fast.c
+++ b/sys/contrib/zstd/lib/compress/zstd_double_fast.c
@ -12,44 +12,58 @@
 #include "zstd_double_fast.h"


-void ZSTD_fillDoubleHashTable(ZSTD_CCtx* cctx, const void* end, const U32 mls)
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+                              ZSTD_compressionParameters const* cParams,
+                              void const* end)
 {
-    U32* const hashLarge = cctx->hashTable;
-    U32  const hBitsL = cctx->appliedParams.cParams.hashLog;
-    U32* const hashSmall = cctx->chainTable;
-    U32  const hBitsS = cctx->appliedParams.cParams.chainLog;
-    const BYTE* const base = cctx->base;
-    const BYTE* ip = base + cctx->nextToUpdate;
+    U32* const hashLarge = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog;
+    U32  const mls = cParams->searchLength;
+    U32* const hashSmall = ms->chainTable;
+    U32  const hBitsS = cParams->chainLog;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
-    const size_t fastHashFillStep = 3;
+    const U32 fastHashFillStep = 3;

-    while(ip <= iend) {
-        hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip - base);
-        hashLarge[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip - base);
-        ip += fastHashFillStep;
+    /* Always insert every fastHashFillStep position into the hash tables.
+     * Insert the other positions into the large hash table if their entry
+     * is empty.
+     */
+    for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+        U32 const current = (U32)(ip - base);
+        U32 i;
+        for (i = 0; i < fastHashFillStep; ++i) {
+            size_t const smHash = ZSTD_hashPtr(ip + i, hBitsS, mls);
+            size_t const lgHash = ZSTD_hashPtr(ip + i, hBitsL, 8);
+            if (i == 0)
+                hashSmall[smHash] = current + i;
+            if (i == 0 || hashLarge[lgHash] == 0)
+                hashLarge[lgHash] = current + i;
+        }
    }
 }


 FORCE_INLINE_TEMPLATE
-size_t ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx* cctx,
-                                 const void* src, size_t srcSize,
-                                 const U32 mls)
+size_t ZSTD_compressBlock_doubleFast_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize,
+        U32 const mls /* template */)
 {
-    U32* const hashLong = cctx->hashTable;
-    const U32 hBitsL = cctx->appliedParams.cParams.hashLog;
-    U32* const hashSmall = cctx->chainTable;
-    const U32 hBitsS = cctx->appliedParams.cParams.chainLog;
-    seqStore_t* seqStorePtr = &(cctx->seqStore);
-    const BYTE* const base = cctx->base;
+    U32* const hashLong = ms->hashTable;
+    const U32 hBitsL = cParams->hashLog;
+    U32* const hashSmall = ms->chainTable;
+    const U32 hBitsS = cParams->chainLog;
+    const BYTE* const base = ms->window.base;
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
-    const U32 lowestIndex = cctx->dictLimit;
+    const U32 lowestIndex = ms->window.dictLimit;
    const BYTE* const lowest = base + lowestIndex;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - HASH_READ_SIZE;
-    U32 offset_1=seqStorePtr->rep[0], offset_2=seqStorePtr->rep[1];
+    U32 offset_1=rep[0], offset_2=rep[1];
    U32 offsetSaved = 0;

    /* init */
@ -76,7 +90,7 @@ size_t ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx* cctx,
            /* favor repcode */
            mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
            ip++;
-            ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH);
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
        } else {
            U32 offset;
            if ( (matchIndexL > lowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip)) ) {
@ -99,14 +113,14 @@ size_t ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx* cctx,
                    while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
                }
            } else {
-                ip += ((ip-anchor) >> g_searchStrength) + 1;
+                ip += ((ip-anchor) >> kSearchStrength) + 1;
                continue;
            }

            offset_2 = offset_1;
            offset_1 = offset;

-            ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
        }

        /* match found */
@ -129,61 +143,63 @@ size_t ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx* cctx,
                { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */
                hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
                hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
-                ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength-MINMATCH);
+                ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH);
                ip += rLength;
                anchor = ip;
                continue;   /* faster when present ... (?) */
    }   }   }

    /* save reps for next block */
-    seqStorePtr->repToConfirm[0] = offset_1 ? offset_1 : offsetSaved;
-    seqStorePtr->repToConfirm[1] = offset_2 ? offset_2 : offsetSaved;
+    rep[0] = offset_1 ? offset_1 : offsetSaved;
+    rep[1] = offset_2 ? offset_2 : offsetSaved;

    /* Return the last literals size */
    return iend - anchor;
 }


-size_t ZSTD_compressBlock_doubleFast(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_doubleFast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
 {
-    const U32 mls = ctx->appliedParams.cParams.searchLength;
+    const U32 mls = cParams->searchLength;
    switch(mls)
    {
    default: /* includes case 3 */
    case 4 :
-        return ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 4);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 4);
    case 5 :
-        return ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 5);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 5);
    case 6 :
-        return ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 6);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 6);
    case 7 :
-        return ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 7);
+        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, cParams, src, srcSize, 7);
    }
 }


-static size_t ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx* ctx,
-                                 const void* src, size_t srcSize,
-                                 const U32 mls)
+static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize,
+        U32 const mls /* template */)
 {
-    U32* const hashLong = ctx->hashTable;
-    U32  const hBitsL = ctx->appliedParams.cParams.hashLog;
-    U32* const hashSmall = ctx->chainTable;
-    U32  const hBitsS = ctx->appliedParams.cParams.chainLog;
-    seqStore_t* seqStorePtr = &(ctx->seqStore);
-    const BYTE* const base = ctx->base;
-    const BYTE* const dictBase = ctx->dictBase;
+    U32* const hashLong = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog;
+    U32* const hashSmall = ms->chainTable;
+    U32  const hBitsS = cParams->chainLog;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
-    const U32   lowestIndex = ctx->lowLimit;
+    const U32   lowestIndex = ms->window.lowLimit;
    const BYTE* const dictStart = dictBase + lowestIndex;
-    const U32   dictLimit = ctx->dictLimit;
+    const U32   dictLimit = ms->window.dictLimit;
    const BYTE* const lowPrefixPtr = base + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - 8;
-    U32 offset_1=seqStorePtr->rep[0], offset_2=seqStorePtr->rep[1];
+    U32 offset_1=rep[0], offset_2=rep[1];

    /* Search Loop */
    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
@ -209,7 +225,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx* ctx,
            const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend;
            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, lowPrefixPtr) + 4;
            ip++;
-            ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH);
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
        } else {
            if ((matchLongIndex > lowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
                const BYTE* matchEnd = matchLongIndex < dictLimit ? dictEnd : iend;
@ -220,7 +236,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx* ctx,
                while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; }   /* catch up */
                offset_2 = offset_1;
                offset_1 = offset;
-                ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+                ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);

            } else if ((matchIndex > lowestIndex) && (MEM_read32(match) == MEM_read32(ip))) {
                size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
@ -245,10 +261,10 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx* ctx,
                }
                offset_2 = offset_1;
                offset_1 = offset;
-                ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+                ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);

            } else {
-                ip += ((ip-anchor) >> g_searchStrength) + 1;
+                ip += ((ip-anchor) >> kSearchStrength) + 1;
                continue;
        }   }

@ -272,7 +288,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx* ctx,
                    const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, lowPrefixPtr) + 4;
                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2-MINMATCH);
+                    ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH);
                    hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
                    hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
                    ip += repLength2;
@ -283,27 +299,29 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx* ctx,
    }   }   }

    /* save reps for next block */
-    seqStorePtr->repToConfirm[0] = offset_1; seqStorePtr->repToConfirm[1] = offset_2;
+    rep[0] = offset_1;
+    rep[1] = offset_2;

    /* Return the last literals size */
    return iend - anchor;
 }


-size_t ZSTD_compressBlock_doubleFast_extDict(ZSTD_CCtx* ctx,
-                         const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_doubleFast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
 {
-    U32 const mls = ctx->appliedParams.cParams.searchLength;
+    U32 const mls = cParams->searchLength;
    switch(mls)
    {
    default: /* includes case 3 */
    case 4 :
-        return ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 4);
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 4);
    case 5 :
-        return ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 5);
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 5);
    case 6 :
-        return ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 6);
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 6);
    case 7 :
-        return ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 7);
+        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 7);
    }
 }
--- a/sys/contrib/zstd/lib/compress/zstd_double_fast.h
+++ b/sys/contrib/zstd/lib/compress/zstd_double_fast.h
@ -16,11 +16,18 @@ extern "C" {
 #endif

 #include "mem.h"      /* U32 */
-#include "zstd.h"     /* ZSTD_CCtx, size_t */
+#include "zstd_compress_internal.h"     /* ZSTD_CCtx, size_t */
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+                              ZSTD_compressionParameters const* cParams,
+                              void const* end);
+size_t ZSTD_compressBlock_doubleFast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_doubleFast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);

-void ZSTD_fillDoubleHashTable(ZSTD_CCtx* cctx, const void* end, const U32 mls);
-size_t ZSTD_compressBlock_doubleFast(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
-size_t ZSTD_compressBlock_doubleFast_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize);

 #if defined (__cplusplus)
 }
--- a/sys/contrib/zstd/lib/compress/zstd_fast.c
+++ b/sys/contrib/zstd/lib/compress/zstd_fast.c
@ -12,39 +12,48 @@
 #include "zstd_fast.h"


-void ZSTD_fillHashTable (ZSTD_CCtx* zc, const void* end, const U32 mls)
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+                        ZSTD_compressionParameters const* cParams,
+                        void const* end)
 {
-    U32* const hashTable = zc->hashTable;
-    U32  const hBits = zc->appliedParams.cParams.hashLog;
-    const BYTE* const base = zc->base;
-    const BYTE* ip = base + zc->nextToUpdate;
+    U32* const hashTable = ms->hashTable;
+    U32  const hBits = cParams->hashLog;
+    U32  const mls = cParams->searchLength;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
-    const size_t fastHashFillStep = 3;
+    const U32 fastHashFillStep = 3;

-    while(ip <= iend) {
-        hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip - base);
-        ip += fastHashFillStep;
+    /* Always insert every fastHashFillStep position into the hash table.
+     * Insert the other positions if their hash entry is empty.
+     */
+    for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+        U32 const current = (U32)(ip - base);
+        U32 i;
+        for (i = 0; i < fastHashFillStep; ++i) {
+            size_t const hash = ZSTD_hashPtr(ip + i, hBits, mls);
+            if (i == 0 || hashTable[hash] == 0)
+                hashTable[hash] = current + i;
+        }
    }
 }

-
 FORCE_INLINE_TEMPLATE
-size_t ZSTD_compressBlock_fast_generic(ZSTD_CCtx* cctx,
-                                 const void* src, size_t srcSize,
-                                 const U32 mls)
+size_t ZSTD_compressBlock_fast_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize,
+        U32 const hlog, U32 const stepSize, U32 const mls)
 {
-    U32* const hashTable = cctx->hashTable;
-    U32  const hBits = cctx->appliedParams.cParams.hashLog;
-    seqStore_t* seqStorePtr = &(cctx->seqStore);
-    const BYTE* const base = cctx->base;
+    U32* const hashTable = ms->hashTable;
+    const BYTE* const base = ms->window.base;
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
-    const U32   lowestIndex = cctx->dictLimit;
+    const U32   lowestIndex = ms->window.dictLimit;
    const BYTE* const lowest = base + lowestIndex;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - HASH_READ_SIZE;
-    U32 offset_1=seqStorePtr->rep[0], offset_2=seqStorePtr->rep[1];
+    U32 offset_1=rep[0], offset_2=rep[1];
    U32 offsetSaved = 0;

    /* init */
@ -57,7 +66,7 @@ size_t ZSTD_compressBlock_fast_generic(ZSTD_CCtx* cctx,
    /* Main Search Loop */
    while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
        size_t mLength;
-        size_t const h = ZSTD_hashPtr(ip, hBits, mls);
+        size_t const h = ZSTD_hashPtr(ip, hlog, mls);
        U32 const current = (U32)(ip-base);
        U32 const matchIndex = hashTable[h];
        const BYTE* match = base + matchIndex;
@ -66,21 +75,21 @@ size_t ZSTD_compressBlock_fast_generic(ZSTD_CCtx* cctx,
        if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
            mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
            ip++;
-            ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH);
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
        } else {
-            U32 offset;
-            if ( (matchIndex <= lowestIndex) || (MEM_read32(match) != MEM_read32(ip)) ) {
-                ip += ((ip-anchor) >> g_searchStrength) + 1;
+            if ( (matchIndex <= lowestIndex)
+              || (MEM_read32(match) != MEM_read32(ip)) ) {
+                assert(stepSize >= 1);
+                ip += ((ip-anchor) >> kSearchStrength) + stepSize;
                continue;
            }
            mLength = ZSTD_count(ip+4, match+4, iend) + 4;
-            offset = (U32)(ip-match);
-            while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
-            offset_2 = offset_1;
-            offset_1 = offset;
-
-            ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
-        }
+            {   U32 const offset = (U32)(ip-match);
+                while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+        }   }

        /* match found */
        ip += mLength;
@ -88,8 +97,8 @@ size_t ZSTD_compressBlock_fast_generic(ZSTD_CCtx* cctx,

        if (ip <= ilimit) {
            /* Fill Table */
-            hashTable[ZSTD_hashPtr(base+current+2, hBits, mls)] = current+2;  /* here because current+2 could be > iend-8 */
-            hashTable[ZSTD_hashPtr(ip-2, hBits, mls)] = (U32)(ip-2-base);
+            hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2;  /* here because current+2 could be > iend-8 */
+            hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
            /* check immediate repcode */
            while ( (ip <= ilimit)
                 && ( (offset_2>0)
@ -97,65 +106,67 @@ size_t ZSTD_compressBlock_fast_generic(ZSTD_CCtx* cctx,
                /* store sequence */
                size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
                { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; }  /* swap offset_2 <=> offset_1 */
-                hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip-base);
-                ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength-MINMATCH);
+                hashTable[ZSTD_hashPtr(ip, hlog, mls)] = (U32)(ip-base);
+                ZSTD_storeSeq(seqStore, 0, anchor, 0, rLength-MINMATCH);
                ip += rLength;
                anchor = ip;
                continue;   /* faster when present ... (?) */
    }   }   }

    /* save reps for next block */
-    seqStorePtr->repToConfirm[0] = offset_1 ? offset_1 : offsetSaved;
-    seqStorePtr->repToConfirm[1] = offset_2 ? offset_2 : offsetSaved;
+    rep[0] = offset_1 ? offset_1 : offsetSaved;
+    rep[1] = offset_2 ? offset_2 : offsetSaved;

    /* Return the last literals size */
    return iend - anchor;
 }


-size_t ZSTD_compressBlock_fast(ZSTD_CCtx* ctx,
-                       const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_fast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
 {
-    const U32 mls = ctx->appliedParams.cParams.searchLength;
+    U32 const hlog = cParams->hashLog;
+    U32 const mls = cParams->searchLength;
+    U32 const stepSize = cParams->targetLength;
    switch(mls)
    {
    default: /* includes case 3 */
    case 4 :
-        return ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 4);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 4);
    case 5 :
-        return ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 5);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 5);
    case 6 :
-        return ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 6);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 6);
    case 7 :
-        return ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 7);
+        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 7);
    }
 }


-static size_t ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
-                                 const void* src, size_t srcSize,
-                                 const U32 mls)
+static size_t ZSTD_compressBlock_fast_extDict_generic(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize,
+        U32 const hlog, U32 const stepSize, U32 const mls)
 {
-    U32* hashTable = ctx->hashTable;
-    const U32 hBits = ctx->appliedParams.cParams.hashLog;
-    seqStore_t* seqStorePtr = &(ctx->seqStore);
-    const BYTE* const base = ctx->base;
-    const BYTE* const dictBase = ctx->dictBase;
+    U32* hashTable = ms->hashTable;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
-    const U32   lowestIndex = ctx->lowLimit;
+    const U32   lowestIndex = ms->window.lowLimit;
    const BYTE* const dictStart = dictBase + lowestIndex;
-    const U32   dictLimit = ctx->dictLimit;
+    const U32   dictLimit = ms->window.dictLimit;
    const BYTE* const lowPrefixPtr = base + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - 8;
-    U32 offset_1=seqStorePtr->rep[0], offset_2=seqStorePtr->rep[1];
+    U32 offset_1=rep[0], offset_2=rep[1];

    /* Search Loop */
    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
-        const size_t h = ZSTD_hashPtr(ip, hBits, mls);
+        const size_t h = ZSTD_hashPtr(ip, hlog, mls);
        const U32 matchIndex = hashTable[h];
        const BYTE* matchBase = matchIndex < dictLimit ? dictBase : base;
        const BYTE* match = matchBase + matchIndex;
@ -171,11 +182,12 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
            const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend;
            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, lowPrefixPtr) + 4;
            ip++;
-            ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH);
+            ZSTD_storeSeq(seqStore, ip-anchor, anchor, 0, mLength-MINMATCH);
        } else {
            if ( (matchIndex < lowestIndex) ||
                 (MEM_read32(match) != MEM_read32(ip)) ) {
-                ip += ((ip-anchor) >> g_searchStrength) + 1;
+                assert(stepSize >= 1);
+                ip += ((ip-anchor) >> kSearchStrength) + stepSize;
                continue;
            }
            {   const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend;
@ -186,7 +198,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
                offset = current - matchIndex;
                offset_2 = offset_1;
                offset_1 = offset;
-                ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+                ZSTD_storeSeq(seqStore, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
        }   }

        /* found a match : store it */
@ -195,8 +207,8 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,

        if (ip <= ilimit) {
            /* Fill Table */
-            hashTable[ZSTD_hashPtr(base+current+2, hBits, mls)] = current+2;
-            hashTable[ZSTD_hashPtr(ip-2, hBits, mls)] = (U32)(ip-2-base);
+            hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2;
+            hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
            /* check immediate repcode */
            while (ip <= ilimit) {
                U32 const current2 = (U32)(ip-base);
@ -207,8 +219,8 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
                    const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, lowPrefixPtr) + 4;
                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2-MINMATCH);
-                    hashTable[ZSTD_hashPtr(ip, hBits, mls)] = current2;
+                    ZSTD_storeSeq(seqStore, 0, anchor, 0, repLength2-MINMATCH);
+                    hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
                    ip += repLength2;
                    anchor = ip;
                    continue;
@ -217,27 +229,31 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
    }   }   }

    /* save reps for next block */
-    seqStorePtr->repToConfirm[0] = offset_1; seqStorePtr->repToConfirm[1] = offset_2;
+    rep[0] = offset_1;
+    rep[1] = offset_2;

    /* Return the last literals size */
    return iend - anchor;
 }


-size_t ZSTD_compressBlock_fast_extDict(ZSTD_CCtx* ctx,
-                         const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_fast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
 {
-    U32 const mls = ctx->appliedParams.cParams.searchLength;
+    U32 const hlog = cParams->hashLog;
+    U32 const mls = cParams->searchLength;
+    U32 const stepSize = cParams->targetLength;
    switch(mls)
    {
    default: /* includes case 3 */
    case 4 :
-        return ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 4);
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 4);
    case 5 :
-        return ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 5);
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 5);
    case 6 :
-        return ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 6);
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 6);
    case 7 :
-        return ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 7);
+        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, hlog, stepSize, 7);
    }
 }
--- a/sys/contrib/zstd/lib/compress/zstd_fast.h
+++ b/sys/contrib/zstd/lib/compress/zstd_fast.h
@ -16,13 +16,17 @@ extern "C" {
 #endif

 #include "mem.h"      /* U32 */
-#include "zstd.h"     /* ZSTD_CCtx, size_t */
+#include "zstd_compress_internal.h"

-void ZSTD_fillHashTable(ZSTD_CCtx* zc, const void* end, const U32 mls);
-size_t ZSTD_compressBlock_fast(ZSTD_CCtx* ctx,
-                         const void* src, size_t srcSize);
-size_t ZSTD_compressBlock_fast_extDict(ZSTD_CCtx* ctx,
-                         const void* src, size_t srcSize);
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+                        ZSTD_compressionParameters const* cParams,
+                        void const* end);
+size_t ZSTD_compressBlock_fast(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_fast_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);

 #if defined (__cplusplus)
 }
--- a/sys/contrib/zstd/lib/compress/zstd_lazy.c
+++ b/sys/contrib/zstd/lib/compress/zstd_lazy.c
@ -15,76 +15,90 @@
 /*-*************************************
 *  Binary Tree search
 ***************************************/
-/** ZSTD_insertBt1() : add one or multiple positions to tree.
- *  ip : assumed <= iend-8 .
- * @return : nb of positions added */
-static U32 ZSTD_insertBt1(ZSTD_CCtx* zc,
-                const BYTE* const ip, const BYTE* const iend,
-                U32 nbCompares, U32 const mls, U32 const extDict)
+
+void ZSTD_updateDUBT(
+                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                const BYTE* ip, const BYTE* iend,
+                U32 mls)
 {
-    U32*   const hashTable = zc->hashTable;
-    U32    const hashLog = zc->appliedParams.cParams.hashLog;
-    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
-    U32*   const bt = zc->chainTable;
-    U32    const btLog  = zc->appliedParams.cParams.chainLog - 1;
+    U32* const hashTable = ms->hashTable;
+    U32  const hashLog = cParams->hashLog;
+
+    U32* const bt = ms->chainTable;
+    U32  const btLog  = cParams->chainLog - 1;
+    U32  const btMask = (1 << btLog) - 1;
+
+    const BYTE* const base = ms->window.base;
+    U32 const target = (U32)(ip - base);
+    U32 idx = ms->nextToUpdate;
+
+    if (idx != target)
+        DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
+                    idx, target, ms->window.dictLimit);
+    assert(ip + 8 <= iend);   /* condition for ZSTD_hashPtr */
+    (void)iend;
+
+    assert(idx >= ms->window.dictLimit);   /* condition for valid base+idx */
+    for ( ; idx < target ; idx++) {
+        size_t const h  = ZSTD_hashPtr(base + idx, hashLog, mls);   /* assumption : ip + 8 <= iend */
+        U32    const matchIndex = hashTable[h];
+
+        U32*   const nextCandidatePtr = bt + 2*(idx&btMask);
+        U32*   const sortMarkPtr  = nextCandidatePtr + 1;
+
+        DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
+        hashTable[h] = idx;   /* Update Hash Table */
+        *nextCandidatePtr = matchIndex;   /* update BT like a chain */
+        *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
+    }
+    ms->nextToUpdate = target;
+}
+
+
+/** ZSTD_insertDUBT1() :
+ *  sort one already inserted but unsorted position
+ *  assumption : current >= btlow == (current - btmask)
+ *  doesn't fail */
+static void ZSTD_insertDUBT1(
+                 ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                 U32 current, const BYTE* inputEnd,
+                 U32 nbCompares, U32 btLow, int extDict)
+{
+    U32*   const bt = ms->chainTable;
+    U32    const btLog  = cParams->chainLog - 1;
    U32    const btMask = (1 << btLog) - 1;
-    U32 matchIndex = hashTable[h];
    size_t commonLengthSmaller=0, commonLengthLarger=0;
-    const BYTE* const base = zc->base;
-    const BYTE* const dictBase = zc->dictBase;
-    const U32 dictLimit = zc->dictLimit;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current;
+    const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* match;
-    const U32 current = (U32)(ip-base);
-    const U32 btLow = btMask >= current ? 0 : current - btMask;
    U32* smallerPtr = bt + 2*(current&btMask);
    U32* largerPtr  = smallerPtr + 1;
+    U32 matchIndex = *smallerPtr;
    U32 dummy32;   /* to be nullified at the end */
-    U32 const windowLow = zc->lowLimit;
-    U32 matchEndIdx = current+8+1;
-    size_t bestLength = 8;
-#ifdef ZSTD_C_PREDICT
-    U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0);
-    U32 predictedLarge = *(bt + 2*((current-1)&btMask) + 1);
-    predictedSmall += (predictedSmall>0);
-    predictedLarge += (predictedLarge>0);
-#endif /* ZSTD_C_PREDICT */
+    U32 const windowLow = ms->window.lowLimit;

-    DEBUGLOG(8, "ZSTD_insertBt1 (%u)", current);
-
-    assert(ip <= iend-8);   /* required for h calculation */
-    hashTable[h] = current;   /* Update Hash Table */
+    DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
+                current, dictLimit, windowLow);
+    assert(current >= btLow);
+    assert(ip < iend);   /* condition for ZSTD_count */

    while (nbCompares-- && (matchIndex > windowLow)) {
        U32* const nextPtr = bt + 2*(matchIndex & btMask);
        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
        assert(matchIndex < current);

-#ifdef ZSTD_C_PREDICT   /* note : can create issues when hlog small <= 11 */
-        const U32* predictPtr = bt + 2*((matchIndex-1) & btMask);   /* written this way, as bt is a roll buffer */
-        if (matchIndex == predictedSmall) {
-            /* no need to check length, result known */
-            *smallerPtr = matchIndex;
-            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
-            smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
-            matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
-            predictedSmall = predictPtr[1] + (predictPtr[1]>0);
-            continue;
-        }
-        if (matchIndex == predictedLarge) {
-            *largerPtr = matchIndex;
-            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
-            largerPtr = nextPtr;
-            matchIndex = nextPtr[0];
-            predictedLarge = predictPtr[0] + (predictPtr[0]>0);
-            continue;
-        }
-#endif
-
-        if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
-            assert(matchIndex+matchLength >= dictLimit);   /* might be wrong if extDict is incorrectly set to 0 */
-            match = base + matchIndex;
+        if ( (!extDict)
+          || (matchIndex+matchLength >= dictLimit)  /* both in current segment*/
+          || (current < dictLimit) /* both in extDict */) {
+            const BYTE* const mBase = !extDict || ((matchIndex+matchLength) >= dictLimit) ? base : dictBase;
+            assert( (matchIndex+matchLength >= dictLimit)   /* might be wrong if extDict is incorrectly set to 0 */
+                 || (current < dictLimit) );
+            match = mBase + matchIndex;
            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
        } else {
            match = dictBase + matchIndex;
@ -93,11 +107,8 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc,
                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
        }

-        if (matchLength > bestLength) {
-            bestLength = matchLength;
-            if (matchLength > matchEndIdx - matchIndex)
-                matchEndIdx = matchIndex + (U32)matchLength;
-        }
+        DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
+                    current, matchIndex, (U32)matchLength);

        if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
            break;   /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
@ -108,6 +119,8 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc,
            *smallerPtr = matchIndex;             /* update smaller idx */
            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
+                        matchIndex, btLow, nextPtr[1]);
            smallerPtr = nextPtr+1;               /* new "candidate" => larger than match, which was smaller than target */
            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous and closer to current */
        } else {
@ -115,184 +128,205 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc,
            *largerPtr = matchIndex;
            commonLengthLarger = matchLength;
            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
+                        matchIndex, btLow, nextPtr[0]);
            largerPtr = nextPtr;
            matchIndex = nextPtr[0];
    }   }

    *smallerPtr = *largerPtr = 0;
-    if (bestLength > 384) return MIN(192, (U32)(bestLength - 384));   /* speed optimization */
-    assert(matchEndIdx > current + 8);
-    return matchEndIdx - (current + 8);
-}
-
-FORCE_INLINE_TEMPLATE
-void ZSTD_updateTree_internal(ZSTD_CCtx* zc,
-                const BYTE* const ip, const BYTE* const iend,
-                const U32 nbCompares, const U32 mls, const U32 extDict)
-{
-    const BYTE* const base = zc->base;
-    U32 const target = (U32)(ip - base);
-    U32 idx = zc->nextToUpdate;
-    DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u  (extDict:%u)",
-                idx, target, extDict);
-
-    while(idx < target)
-        idx += ZSTD_insertBt1(zc, base+idx, iend, nbCompares, mls, extDict);
-    zc->nextToUpdate = target;
-}
-
-void ZSTD_updateTree(ZSTD_CCtx* zc,
-                const BYTE* const ip, const BYTE* const iend,
-                const U32 nbCompares, const U32 mls)
-{
-    ZSTD_updateTree_internal(zc, ip, iend, nbCompares, mls, 0 /*extDict*/);
-}
-
-void ZSTD_updateTree_extDict(ZSTD_CCtx* zc,
-                const BYTE* const ip, const BYTE* const iend,
-                const U32 nbCompares, const U32 mls)
-{
-    ZSTD_updateTree_internal(zc, ip, iend, nbCompares, mls, 1 /*extDict*/);
 }


-static size_t ZSTD_insertBtAndFindBestMatch (
-                        ZSTD_CCtx* zc,
-                        const BYTE* const ip, const BYTE* const iend,
-                        size_t* offsetPtr,
-                        U32 nbCompares, const U32 mls,
-                        U32 extDict)
+static size_t ZSTD_DUBT_findBestMatch (
+                            ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                            const BYTE* const ip, const BYTE* const iend,
+                            size_t* offsetPtr,
+                            U32 const mls,
+                            U32 const extDict)
 {
-    U32*   const hashTable = zc->hashTable;
-    U32    const hashLog = zc->appliedParams.cParams.hashLog;
+    U32*   const hashTable = ms->hashTable;
+    U32    const hashLog = cParams->hashLog;
    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
-    U32*   const bt = zc->chainTable;
-    U32    const btLog  = zc->appliedParams.cParams.chainLog - 1;
+    U32          matchIndex  = hashTable[h];
+
+    const BYTE* const base = ms->window.base;
+    U32    const current = (U32)(ip-base);
+    U32    const windowLow = ms->window.lowLimit;
+
+    U32*   const bt = ms->chainTable;
+    U32    const btLog  = cParams->chainLog - 1;
    U32    const btMask = (1 << btLog) - 1;
-    U32 matchIndex  = hashTable[h];
-    size_t commonLengthSmaller=0, commonLengthLarger=0;
-    const BYTE* const base = zc->base;
-    const BYTE* const dictBase = zc->dictBase;
-    const U32 dictLimit = zc->dictLimit;
-    const BYTE* const dictEnd = dictBase + dictLimit;
-    const BYTE* const prefixStart = base + dictLimit;
-    const U32 current = (U32)(ip-base);
-    const U32 btLow = btMask >= current ? 0 : current - btMask;
-    const U32 windowLow = zc->lowLimit;
-    U32* smallerPtr = bt + 2*(current&btMask);
-    U32* largerPtr  = bt + 2*(current&btMask) + 1;
-    U32 matchEndIdx = current+8+1;
-    U32 dummy32;   /* to be nullified at the end */
-    size_t bestLength = 0;
+    U32    const btLow = (btMask >= current) ? 0 : current - btMask;
+    U32    const unsortLimit = MAX(btLow, windowLow);

+    U32*         nextCandidate = bt + 2*(matchIndex&btMask);
+    U32*         unsortedMark = bt + 2*(matchIndex&btMask) + 1;
+    U32          nbCompares = 1U << cParams->searchLog;
+    U32          nbCandidates = nbCompares;
+    U32          previousCandidate = 0;
+
+    DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current);
    assert(ip <= iend-8);   /* required for h calculation */
-    hashTable[h] = current;   /* Update Hash Table */

-    while (nbCompares-- && (matchIndex > windowLow)) {
-        U32* const nextPtr = bt + 2*(matchIndex & btMask);
-        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
-        const BYTE* match;
+    /* reach end of unsorted candidates list */
+    while ( (matchIndex > unsortLimit)
+         && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
+         && (nbCandidates > 1) ) {
+        DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
+                    matchIndex);
+        *unsortedMark = previousCandidate;
+        previousCandidate = matchIndex;
+        matchIndex = *nextCandidate;
+        nextCandidate = bt + 2*(matchIndex&btMask);
+        unsortedMark = bt + 2*(matchIndex&btMask) + 1;
+        nbCandidates --;
+    }

-        if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
-            match = base + matchIndex;
-            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
-        } else {
-            match = dictBase + matchIndex;
-            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
-            if (matchIndex+matchLength >= dictLimit)
-                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
-        }
+    if ( (matchIndex > unsortLimit)
+      && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
+        DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
+                    matchIndex);
+        *nextCandidate = *unsortedMark = 0;   /* nullify next candidate if it's still unsorted (note : simplification, detrimental to compression ratio, beneficial for speed) */
+    }

-        if (matchLength > bestLength) {
-            if (matchLength > matchEndIdx - matchIndex)
-                matchEndIdx = matchIndex + (U32)matchLength;
-            if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
-                bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
-            if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
-                break;   /* drop, to guarantee consistency (miss a little bit of compression) */
+    /* batch sort stacked candidates */
+    matchIndex = previousCandidate;
+    while (matchIndex) {  /* will end on matchIndex == 0 */
+        U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
+        U32 const nextCandidateIdx = *nextCandidateIdxPtr;
+        ZSTD_insertDUBT1(ms, cParams, matchIndex, iend,
+                         nbCandidates, unsortLimit, extDict);
+        matchIndex = nextCandidateIdx;
+        nbCandidates++;
+    }
+
+    /* find longest match */
+    {   size_t commonLengthSmaller=0, commonLengthLarger=0;
+        const BYTE* const dictBase = ms->window.dictBase;
+        const U32 dictLimit = ms->window.dictLimit;
+        const BYTE* const dictEnd = dictBase + dictLimit;
+        const BYTE* const prefixStart = base + dictLimit;
+        U32* smallerPtr = bt + 2*(current&btMask);
+        U32* largerPtr  = bt + 2*(current&btMask) + 1;
+        U32 matchEndIdx = current+8+1;
+        U32 dummy32;   /* to be nullified at the end */
+        size_t bestLength = 0;
+
+        matchIndex  = hashTable[h];
+        hashTable[h] = current;   /* Update Hash Table */
+
+        while (nbCompares-- && (matchIndex > windowLow)) {
+            U32* const nextPtr = bt + 2*(matchIndex & btMask);
+            size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+            const BYTE* match;
+
+            if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
+                match = base + matchIndex;
+                matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+            } else {
+                match = dictBase + matchIndex;
+                matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+                if (matchIndex+matchLength >= dictLimit)
+                    match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
            }
+
+            if (matchLength > bestLength) {
+                if (matchLength > matchEndIdx - matchIndex)
+                    matchEndIdx = matchIndex + (U32)matchLength;
+                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
+                    bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+                if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
+                    break;   /* drop, to guarantee consistency (miss a little bit of compression) */
+                }
+            }
+
+            if (match[matchLength] < ip[matchLength]) {
+                /* match is smaller than current */
+                *smallerPtr = matchIndex;             /* update smaller idx */
+                commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+                if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+                smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
+                matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+            } else {
+                /* match is larger than current */
+                *largerPtr = matchIndex;
+                commonLengthLarger = matchLength;
+                if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+                largerPtr = nextPtr;
+                matchIndex = nextPtr[0];
+        }   }
+
+        *smallerPtr = *largerPtr = 0;
+
+        assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */
+        ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
+        if (bestLength >= MINMATCH) {
+            U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+            DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
+                        current, (U32)bestLength, (U32)*offsetPtr, mIndex);
        }
-
-        if (match[matchLength] < ip[matchLength]) {
-            /* match is smaller than current */
-            *smallerPtr = matchIndex;             /* update smaller idx */
-            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
-            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
-            smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
-            matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
-        } else {
-            /* match is larger than current */
-            *largerPtr = matchIndex;
-            commonLengthLarger = matchLength;
-            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
-            largerPtr = nextPtr;
-            matchIndex = nextPtr[0];
-    }   }
-
-    *smallerPtr = *largerPtr = 0;
-
-    assert(matchEndIdx > current+8);
-    zc->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
-    return bestLength;
+        return bestLength;
+    }
 }


 /** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
 static size_t ZSTD_BtFindBestMatch (
-                        ZSTD_CCtx* zc,
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
                        const BYTE* const ip, const BYTE* const iLimit,
                        size_t* offsetPtr,
-                        const U32 maxNbAttempts, const U32 mls)
+                        const U32 mls /* template */)
 {
-    if (ip < zc->base + zc->nextToUpdate) return 0;   /* skipped area */
-    ZSTD_updateTree(zc, ip, iLimit, maxNbAttempts, mls);
-    return ZSTD_insertBtAndFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, mls, 0);
+    DEBUGLOG(7, "ZSTD_BtFindBestMatch");
+    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
+    ZSTD_updateDUBT(ms, cParams, ip, iLimit, mls);
+    return ZSTD_DUBT_findBestMatch(ms, cParams, ip, iLimit, offsetPtr, mls, 0);
 }


 static size_t ZSTD_BtFindBestMatch_selectMLS (
-                        ZSTD_CCtx* zc,   /* Index table will be updated */
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
                        const BYTE* ip, const BYTE* const iLimit,
-                        size_t* offsetPtr,
-                        const U32 maxNbAttempts, const U32 matchLengthSearch)
+                        size_t* offsetPtr)
 {
-    switch(matchLengthSearch)
+    switch(cParams->searchLength)
    {
    default : /* includes case 3 */
-    case 4 : return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4);
-    case 5 : return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5);
+    case 4 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 4);
+    case 5 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 5);
    case 7 :
-    case 6 : return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6);
+    case 6 : return ZSTD_BtFindBestMatch(ms, cParams, ip, iLimit, offsetPtr, 6);
    }
 }


 /** Tree updater, providing best match */
 static size_t ZSTD_BtFindBestMatch_extDict (
-                        ZSTD_CCtx* zc,
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
                        const BYTE* const ip, const BYTE* const iLimit,
                        size_t* offsetPtr,
-                        const U32 maxNbAttempts, const U32 mls)
+                        const U32 mls)
 {
-    if (ip < zc->base + zc->nextToUpdate) return 0;   /* skipped area */
-    ZSTD_updateTree_extDict(zc, ip, iLimit, maxNbAttempts, mls);
-    return ZSTD_insertBtAndFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, mls, 1);
+    DEBUGLOG(7, "ZSTD_BtFindBestMatch_extDict");
+    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
+    ZSTD_updateDUBT(ms, cParams, ip, iLimit, mls);
+    return ZSTD_DUBT_findBestMatch(ms, cParams, ip, iLimit, offsetPtr, mls, 1);
 }


 static size_t ZSTD_BtFindBestMatch_selectMLS_extDict (
-                        ZSTD_CCtx* zc,   /* Index table will be updated */
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
                        const BYTE* ip, const BYTE* const iLimit,
-                        size_t* offsetPtr,
-                        const U32 maxNbAttempts, const U32 matchLengthSearch)
+                        size_t* offsetPtr)
 {
-    switch(matchLengthSearch)
+    switch(cParams->searchLength)
    {
    default : /* includes case 3 */
-    case 4 : return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4);
-    case 5 : return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5);
+    case 4 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 4);
+    case 5 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 5);
    case 7 :
-    case 6 : return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6);
+    case 6 : return ZSTD_BtFindBestMatch_extDict(ms, cParams, ip, iLimit, offsetPtr, 6);
    }
 }

@ -305,15 +339,17 @@ static size_t ZSTD_BtFindBestMatch_selectMLS_extDict (

 /* Update chains up to ip (excluded)
   Assumption : always within prefix (i.e. not within extDict) */
-U32 ZSTD_insertAndFindFirstIndex (ZSTD_CCtx* zc, const BYTE* ip, U32 mls)
+static U32 ZSTD_insertAndFindFirstIndex_internal(
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, U32 const mls)
 {
-    U32* const hashTable  = zc->hashTable;
-    const U32 hashLog = zc->appliedParams.cParams.hashLog;
-    U32* const chainTable = zc->chainTable;
-    const U32 chainMask = (1 << zc->appliedParams.cParams.chainLog) - 1;
-    const BYTE* const base = zc->base;
+    U32* const hashTable  = ms->hashTable;
+    const U32 hashLog = cParams->hashLog;
+    U32* const chainTable = ms->chainTable;
+    const U32 chainMask = (1 << cParams->chainLog) - 1;
+    const BYTE* const base = ms->window.base;
    const U32 target = (U32)(ip - base);
-    U32 idx = zc->nextToUpdate;
+    U32 idx = ms->nextToUpdate;

    while(idx < target) { /* catch up */
        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
@ -322,35 +358,42 @@ U32 ZSTD_insertAndFindFirstIndex (ZSTD_CCtx* zc, const BYTE* ip, U32 mls)
        idx++;
    }

-    zc->nextToUpdate = target;
+    ms->nextToUpdate = target;
    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
 }

+U32 ZSTD_insertAndFindFirstIndex(
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip)
+{
+    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, cParams->searchLength);
+}
+

 /* inlining is important to hardwire a hot branch (template emulation) */
 FORCE_INLINE_TEMPLATE
 size_t ZSTD_HcFindBestMatch_generic (
-                        ZSTD_CCtx* zc,   /* Index table will be updated */
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
                        const BYTE* const ip, const BYTE* const iLimit,
                        size_t* offsetPtr,
-                        const U32 maxNbAttempts, const U32 mls, const U32 extDict)
+                        const U32 mls, const U32 extDict)
 {
-    U32* const chainTable = zc->chainTable;
-    const U32 chainSize = (1 << zc->appliedParams.cParams.chainLog);
+    U32* const chainTable = ms->chainTable;
+    const U32 chainSize = (1 << cParams->chainLog);
    const U32 chainMask = chainSize-1;
-    const BYTE* const base = zc->base;
-    const BYTE* const dictBase = zc->dictBase;
-    const U32 dictLimit = zc->dictLimit;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
-    const U32 lowLimit = zc->lowLimit;
+    const U32 lowLimit = ms->window.lowLimit;
    const U32 current = (U32)(ip-base);
    const U32 minChain = current > chainSize ? current - chainSize : 0;
-    int nbAttempts=maxNbAttempts;
+    U32 nbAttempts = 1U << cParams->searchLog;
    size_t ml=4-1;

    /* HC4 match finder */
-    U32 matchIndex = ZSTD_insertAndFindFirstIndex (zc, ip, mls);
+    U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);

    for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
        size_t currentMl=0;
@ -381,35 +424,33 @@ size_t ZSTD_HcFindBestMatch_generic (


 FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
-                        ZSTD_CCtx* zc,
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
                        const BYTE* ip, const BYTE* const iLimit,
-                        size_t* offsetPtr,
-                        const U32 maxNbAttempts, const U32 matchLengthSearch)
+                        size_t* offsetPtr)
 {
-    switch(matchLengthSearch)
+    switch(cParams->searchLength)
    {
    default : /* includes case 3 */
-    case 4 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4, 0);
-    case 5 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5, 0);
+    case 4 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, 0);
+    case 5 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, 0);
    case 7 :
-    case 6 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6, 0);
+    case 6 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, 0);
    }
 }


 FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
-                        ZSTD_CCtx* const zc,
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
                        const BYTE* ip, const BYTE* const iLimit,
-                        size_t* const offsetPtr,
-                        U32 const maxNbAttempts, U32 const matchLengthSearch)
+                        size_t* const offsetPtr)
 {
-    switch(matchLengthSearch)
+    switch(cParams->searchLength)
    {
    default : /* includes case 3 */
-    case 4 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4, 1);
-    case 5 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5, 1);
+    case 4 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 4, 1);
+    case 5 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 5, 1);
    case 7 :
-    case 6 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6, 1);
+    case 6 : return ZSTD_HcFindBestMatch_generic(ms, cParams, ip, iLimit, offsetPtr, 6, 1);
    }
 }

@ -418,30 +459,29 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
 *  Common parser - lazy strategy
 *********************************/
 FORCE_INLINE_TEMPLATE
-size_t ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
-                                       const void* src, size_t srcSize,
-                                       const U32 searchMethod, const U32 depth)
+size_t ZSTD_compressBlock_lazy_generic(
+                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
+                        U32 rep[ZSTD_REP_NUM],
+                        ZSTD_compressionParameters const* cParams,
+                        const void* src, size_t srcSize,
+                        const U32 searchMethod, const U32 depth)
 {
-    seqStore_t* seqStorePtr = &(ctx->seqStore);
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - 8;
-    const BYTE* const base = ctx->base + ctx->dictLimit;
+    const BYTE* const base = ms->window.base + ms->window.dictLimit;

-    U32 const maxSearches = 1 << ctx->appliedParams.cParams.searchLog;
-    U32 const mls = ctx->appliedParams.cParams.searchLength;
-
-    typedef size_t (*searchMax_f)(ZSTD_CCtx* zc, const BYTE* ip, const BYTE* iLimit,
-                        size_t* offsetPtr,
-                        U32 maxNbAttempts, U32 matchLengthSearch);
+    typedef size_t (*searchMax_f)(
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
    searchMax_f const searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS;
-    U32 offset_1 = seqStorePtr->rep[0], offset_2 = seqStorePtr->rep[1], savedOffset=0;
+    U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;

    /* init */
    ip += (ip==base);
-    ctx->nextToUpdate3 = ctx->nextToUpdate;
+    ms->nextToUpdate3 = ms->nextToUpdate;
    {   U32 const maxRep = (U32)(ip-base);
        if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
        if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
@ -462,13 +502,13 @@ size_t ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,

        /* first search (depth 0) */
        {   size_t offsetFound = 99999999;
-            size_t const ml2 = searchMax(ctx, ip, iend, &offsetFound, maxSearches, mls);
+            size_t const ml2 = searchMax(ms, cParams, ip, iend, &offsetFound);
            if (ml2 > matchLength)
                matchLength = ml2, start = ip, offset=offsetFound;
        }

        if (matchLength < 4) {
-            ip += ((ip-anchor) >> g_searchStrength) + 1;   /* jump faster over incompressible sections */
+            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
            continue;
        }

@ -484,7 +524,7 @@ size_t ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
                    matchLength = mlRep, offset = 0, start = ip;
            }
            {   size_t offset2=99999999;
-                size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+                size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
                if ((ml2 >= 4) && (gain2 > gain1)) {
@ -503,7 +543,7 @@ size_t ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
                        matchLength = ml2, offset = 0, start = ip;
                }
                {   size_t offset2=99999999;
-                    size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+                    size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
                    if ((ml2 >= 4) && (gain2 > gain1)) {
@ -528,7 +568,7 @@ size_t ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
        /* store sequence */
 _storeSequence:
        {   size_t const litLength = start - anchor;
-            ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength-MINMATCH);
+            ZSTD_storeSeq(seqStore, litLength, anchor, (U32)offset, matchLength-MINMATCH);
            anchor = ip = start + matchLength;
        }

@ -538,73 +578,80 @@ _storeSequence:
            /* store sequence */
            matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
            offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
-            ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength-MINMATCH);
+            ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
            ip += matchLength;
            anchor = ip;
            continue;   /* faster when present ... (?) */
    }   }

    /* Save reps for next block */
-    seqStorePtr->repToConfirm[0] = offset_1 ? offset_1 : savedOffset;
-    seqStorePtr->repToConfirm[1] = offset_2 ? offset_2 : savedOffset;
+    rep[0] = offset_1 ? offset_1 : savedOffset;
+    rep[1] = offset_2 ? offset_2 : savedOffset;

    /* Return the last literals size */
    return iend - anchor;
 }


-size_t ZSTD_compressBlock_btlazy2(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_btlazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 1, 2);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2);
 }

-size_t ZSTD_compressBlock_lazy2(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_lazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 2);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2);
 }

-size_t ZSTD_compressBlock_lazy(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_lazy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 1);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1);
 }

-size_t ZSTD_compressBlock_greedy(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_greedy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 0);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0);
 }


 FORCE_INLINE_TEMPLATE
-size_t ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
-                                     const void* src, size_t srcSize,
-                                     const U32 searchMethod, const U32 depth)
+size_t ZSTD_compressBlock_lazy_extDict_generic(
+                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
+                        U32 rep[ZSTD_REP_NUM],
+                        ZSTD_compressionParameters const* cParams,
+                        const void* src, size_t srcSize,
+                        const U32 searchMethod, const U32 depth)
 {
-    seqStore_t* seqStorePtr = &(ctx->seqStore);
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - 8;
-    const BYTE* const base = ctx->base;
-    const U32 dictLimit = ctx->dictLimit;
-    const U32 lowestIndex = ctx->lowLimit;
+    const BYTE* const base = ms->window.base;
+    const U32 dictLimit = ms->window.dictLimit;
+    const U32 lowestIndex = ms->window.lowLimit;
    const BYTE* const prefixStart = base + dictLimit;
-    const BYTE* const dictBase = ctx->dictBase;
+    const BYTE* const dictBase = ms->window.dictBase;
    const BYTE* const dictEnd  = dictBase + dictLimit;
-    const BYTE* const dictStart  = dictBase + ctx->lowLimit;
+    const BYTE* const dictStart  = dictBase + lowestIndex;

-    const U32 maxSearches = 1 << ctx->appliedParams.cParams.searchLog;
-    const U32 mls = ctx->appliedParams.cParams.searchLength;
-
-    typedef size_t (*searchMax_f)(ZSTD_CCtx* zc, const BYTE* ip, const BYTE* iLimit,
-                        size_t* offsetPtr,
-                        U32 maxNbAttempts, U32 matchLengthSearch);
+    typedef size_t (*searchMax_f)(
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                        const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
    searchMax_f searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS_extDict : ZSTD_HcFindBestMatch_extDict_selectMLS;

-    U32 offset_1 = seqStorePtr->rep[0], offset_2 = seqStorePtr->rep[1];
+    U32 offset_1 = rep[0], offset_2 = rep[1];

    /* init */
-    ctx->nextToUpdate3 = ctx->nextToUpdate;
+    ms->nextToUpdate3 = ms->nextToUpdate;
    ip += (ip == prefixStart);

    /* Match Loop */
@ -628,13 +675,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,

        /* first search (depth 0) */
        {   size_t offsetFound = 99999999;
-            size_t const ml2 = searchMax(ctx, ip, iend, &offsetFound, maxSearches, mls);
+            size_t const ml2 = searchMax(ms, cParams, ip, iend, &offsetFound);
            if (ml2 > matchLength)
                matchLength = ml2, start = ip, offset=offsetFound;
        }

         if (matchLength < 4) {
-            ip += ((ip-anchor) >> g_searchStrength) + 1;   /* jump faster over incompressible sections */
+            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
            continue;
        }

@ -661,7 +708,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,

            /* search match, depth 1 */
            {   size_t offset2=99999999;
-                size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+                size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
                if ((ml2 >= 4) && (gain2 > gain1)) {
@ -691,7 +738,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,

                /* search match, depth 2 */
                {   size_t offset2=99999999;
-                    size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+                    size_t const ml2 = searchMax(ms, cParams, ip, iend, &offset2);
                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
                    if ((ml2 >= 4) && (gain2 > gain1)) {
@ -713,7 +760,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
        /* store sequence */
 _storeSequence:
        {   size_t const litLength = start - anchor;
-            ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength-MINMATCH);
+            ZSTD_storeSeq(seqStore, litLength, anchor, (U32)offset, matchLength-MINMATCH);
            anchor = ip = start + matchLength;
        }

@ -728,7 +775,7 @@ _storeSequence:
                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap offset history */
-                ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength-MINMATCH);
+                ZSTD_storeSeq(seqStore, 0, anchor, 0, matchLength-MINMATCH);
                ip += matchLength;
                anchor = ip;
                continue;   /* faster when present ... (?) */
@ -737,29 +784,41 @@ _storeSequence:
    }   }

    /* Save reps for next block */
-    seqStorePtr->repToConfirm[0] = offset_1; seqStorePtr->repToConfirm[1] = offset_2;
+    rep[0] = offset_1;
+    rep[1] = offset_2;

    /* Return the last literals size */
    return iend - anchor;
 }


-size_t ZSTD_compressBlock_greedy_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_greedy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 0);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 0);
 }

-size_t ZSTD_compressBlock_lazy_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_lazy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 1);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 1);
 }

-size_t ZSTD_compressBlock_lazy2_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_lazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 2);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 0, 2);
 }

-size_t ZSTD_compressBlock_btlazy2_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_btlazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
+
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 1, 2);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, cParams, src, srcSize, 1, 2);
 }
--- a/sys/contrib/zstd/lib/compress/zstd_lazy.h
+++ b/sys/contrib/zstd/lib/compress/zstd_lazy.h
@ -15,22 +15,39 @@
 extern "C" {
 #endif

-#include "mem.h"    /* U32 */
-#include "zstd.h"   /* ZSTD_CCtx, size_t */
+#include "zstd_compress_internal.h"

-U32 ZSTD_insertAndFindFirstIndex (ZSTD_CCtx* zc, const BYTE* ip, U32 mls);
-void ZSTD_updateTree(ZSTD_CCtx* zc, const BYTE* const ip, const BYTE* const iend, const U32 nbCompares, const U32 mls);
-void ZSTD_updateTree_extDict(ZSTD_CCtx* zc, const BYTE* const ip, const BYTE* const iend, const U32 nbCompares, const U32 mls);
+U32 ZSTD_insertAndFindFirstIndex(
+        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+        const BYTE* ip);

-size_t ZSTD_compressBlock_btlazy2(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
+void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue);  /*! used in ZSTD_reduceIndex(). pre-emptively increase value of ZSTD_DUBT_UNSORTED_MARK */

-size_t ZSTD_compressBlock_greedy_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
-size_t ZSTD_compressBlock_btlazy2_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
+size_t ZSTD_compressBlock_btlazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_greedy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btlazy2_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);

 #if defined (__cplusplus)
 }
--- a/sys/contrib/zstd/lib/compress/zstd_ldm.c
+++ b/sys/contrib/zstd/lib/compress/zstd_ldm.c
@ -17,36 +17,45 @@
 #define LDM_HASH_RLOG 7
 #define LDM_HASH_CHAR_OFFSET 10

-size_t ZSTD_ldm_initializeParameters(ldmParams_t* params, U32 enableLdm)
+void ZSTD_ldm_adjustParameters(ldmParams_t* params,
+                               ZSTD_compressionParameters const* cParams)
 {
+    U32 const windowLog = cParams->windowLog;
    ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX);
-    params->enableLdm = enableLdm>0;
-    params->hashLog = 0;
-    params->bucketSizeLog = LDM_BUCKET_SIZE_LOG;
-    params->minMatchLength = LDM_MIN_MATCH_LENGTH;
-    params->hashEveryLog = ZSTD_LDM_HASHEVERYLOG_NOTSET;
-    return 0;
-}
-
-void ZSTD_ldm_adjustParameters(ldmParams_t* params, U32 windowLog)
-{
+    DEBUGLOG(4, "ZSTD_ldm_adjustParameters");
+    if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG;
+    if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH;
+    if (cParams->strategy >= ZSTD_btopt) {
+      /* Get out of the way of the optimal parser */
+      U32 const minMatch = MAX(cParams->targetLength, params->minMatchLength);
+      assert(minMatch >= ZSTD_LDM_MINMATCH_MIN);
+      assert(minMatch <= ZSTD_LDM_MINMATCH_MAX);
+      params->minMatchLength = minMatch;
+    }
    if (params->hashLog == 0) {
        params->hashLog = MAX(ZSTD_HASHLOG_MIN, windowLog - LDM_HASH_RLOG);
        assert(params->hashLog <= ZSTD_HASHLOG_MAX);
    }
-    if (params->hashEveryLog == ZSTD_LDM_HASHEVERYLOG_NOTSET) {
+    if (params->hashEveryLog == 0) {
        params->hashEveryLog =
                windowLog < params->hashLog ? 0 : windowLog - params->hashLog;
    }
    params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog);
 }

-size_t ZSTD_ldm_getTableSize(U32 hashLog, U32 bucketSizeLog) {
-    size_t const ldmHSize = ((size_t)1) << hashLog;
-    size_t const ldmBucketSizeLog = MIN(bucketSizeLog, hashLog);
+size_t ZSTD_ldm_getTableSize(ldmParams_t params)
+{
+    size_t const ldmHSize = ((size_t)1) << params.hashLog;
+    size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog);
    size_t const ldmBucketSize =
-        ((size_t)1) << (hashLog - ldmBucketSizeLog);
-    return ldmBucketSize + (ldmHSize * (sizeof(ldmEntry_t)));
+        ((size_t)1) << (params.hashLog - ldmBucketSizeLog);
+    size_t const totalSize = ldmBucketSize + ldmHSize * sizeof(ldmEntry_t);
+    return params.enableLdm ? totalSize : 0;
+}
+
+size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize)
+{
+    return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0;
 }

 /** ZSTD_ldm_getSmallHash() :
@ -167,6 +176,7 @@ static U64 ZSTD_ldm_ipow(U64 base, U64 exp)
 }

 U64 ZSTD_ldm_getHashPower(U32 minMatchLength) {
+    DEBUGLOG(4, "ZSTD_ldm_getHashPower: mml=%u", minMatchLength);
    assert(minMatchLength >= ZSTD_LDM_MINMATCH_MIN);
    return ZSTD_ldm_ipow(prime8bytes, minMatchLength - 1);
 }
@ -205,21 +215,22 @@ static size_t ZSTD_ldm_countBackwardsMatch(
 *
 *  The tables for the other strategies are filled within their
 *  block compressors. */
-static size_t ZSTD_ldm_fillFastTables(ZSTD_CCtx* zc, const void* end)
+static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
+                                      ZSTD_compressionParameters const* cParams,
+                                      void const* end)
 {
    const BYTE* const iend = (const BYTE*)end;
-    const U32 mls = zc->appliedParams.cParams.searchLength;

-    switch(zc->appliedParams.cParams.strategy)
+    switch(cParams->strategy)
    {
    case ZSTD_fast:
-        ZSTD_fillHashTable(zc, iend, mls);
-        zc->nextToUpdate = (U32)(iend - zc->base);
+        ZSTD_fillHashTable(ms, cParams, iend);
+        ms->nextToUpdate = (U32)(iend - ms->window.base);
        break;

    case ZSTD_dfast:
-        ZSTD_fillDoubleHashTable(zc, iend, mls);
-        zc->nextToUpdate = (U32)(iend - zc->base);
+        ZSTD_fillDoubleHashTable(ms, cParams, iend);
+        ms->nextToUpdate = (U32)(iend - ms->window.base);
        break;

    case ZSTD_greedy:
@ -268,69 +279,62 @@ static U64 ZSTD_ldm_fillLdmHashTable(ldmState_t* state,
 *  Sets cctx->nextToUpdate to a position corresponding closer to anchor
 *  if it is far way
 *  (after a long match, only update tables a limited amount). */
-static void ZSTD_ldm_limitTableUpdate(ZSTD_CCtx* cctx, const BYTE* anchor)
+static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor)
 {
-    U32 const current = (U32)(anchor - cctx->base);
-    if (current > cctx->nextToUpdate + 1024) {
-        cctx->nextToUpdate =
-            current - MIN(512, current - cctx->nextToUpdate - 1024);
+    U32 const current = (U32)(anchor - ms->window.base);
+    if (current > ms->nextToUpdate + 1024) {
+        ms->nextToUpdate =
+            current - MIN(512, current - ms->nextToUpdate - 1024);
    }
 }

-typedef size_t (*ZSTD_blockCompressor) (ZSTD_CCtx* ctx, const void* src, size_t srcSize);
-/* defined in zstd_compress.c */
-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int extDict);
-
-FORCE_INLINE_TEMPLATE
-size_t ZSTD_compressBlock_ldm_generic(ZSTD_CCtx* cctx,
-                                      const void* src, size_t srcSize)
+static size_t ZSTD_ldm_generateSequences_internal(
+        ldmState_t* ldmState, rawSeqStore_t* rawSeqStore,
+        ldmParams_t const* params, void const* src, size_t srcSize)
 {
-    ldmState_t* const ldmState = &(cctx->ldmState);
-    const ldmParams_t ldmParams = cctx->appliedParams.ldmParams;
-    const U64 hashPower = ldmState->hashPower;
-    const U32 hBits = ldmParams.hashLog - ldmParams.bucketSizeLog;
-    const U32 ldmBucketSize = ((U32)1 << ldmParams.bucketSizeLog);
-    const U32 ldmTagMask = ((U32)1 << ldmParams.hashEveryLog) - 1;
-    seqStore_t* const seqStorePtr = &(cctx->seqStore);
-    const BYTE* const base = cctx->base;
-    const BYTE* const istart = (const BYTE*)src;
-    const BYTE* ip = istart;
-    const BYTE* anchor = istart;
-    const U32   lowestIndex = cctx->dictLimit;
-    const BYTE* const lowest = base + lowestIndex;
-    const BYTE* const iend = istart + srcSize;
-    const BYTE* const ilimit = iend - MAX(ldmParams.minMatchLength, HASH_READ_SIZE);
-
-    const ZSTD_blockCompressor blockCompressor =
-        ZSTD_selectBlockCompressor(cctx->appliedParams.cParams.strategy, 0);
-    U32* const repToConfirm = seqStorePtr->repToConfirm;
-    U32 savedRep[ZSTD_REP_NUM];
+    /* LDM parameters */
+    int const extDict = ZSTD_window_hasExtDict(ldmState->window);
+    U32 const minMatchLength = params->minMatchLength;
+    U64 const hashPower = ldmState->hashPower;
+    U32 const hBits = params->hashLog - params->bucketSizeLog;
+    U32 const ldmBucketSize = 1U << params->bucketSizeLog;
+    U32 const hashEveryLog = params->hashEveryLog;
+    U32 const ldmTagMask = (1U << params->hashEveryLog) - 1;
+    /* Prefix and extDict parameters */
+    U32 const dictLimit = ldmState->window.dictLimit;
+    U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit;
+    BYTE const* const base = ldmState->window.base;
+    BYTE const* const dictBase = extDict ? ldmState->window.dictBase : NULL;
+    BYTE const* const dictStart = extDict ? dictBase + lowestIndex : NULL;
+    BYTE const* const dictEnd = extDict ? dictBase + dictLimit : NULL;
+    BYTE const* const lowPrefixPtr = base + dictLimit;
+    /* Input bounds */
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    BYTE const* const ilimit = iend - MAX(minMatchLength, HASH_READ_SIZE);
+    /* Input positions */
+    BYTE const* anchor = istart;
+    BYTE const* ip = istart;
+    /* Rolling hash */
+    BYTE const* lastHashed = NULL;
    U64 rollingHash = 0;
-    const BYTE* lastHashed = NULL;
-    size_t i, lastLiterals;

-    /* Save seqStorePtr->rep and copy repToConfirm */
-    for (i = 0; i < ZSTD_REP_NUM; i++)
-        savedRep[i] = repToConfirm[i] = seqStorePtr->rep[i];
-
-    /* Main Search Loop */
-    while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
+    while (ip <= ilimit) {
        size_t mLength;
        U32 const current = (U32)(ip - base);
        size_t forwardMatchLength = 0, backwardMatchLength = 0;
        ldmEntry_t* bestEntry = NULL;
        if (ip != istart) {
            rollingHash = ZSTD_ldm_updateHash(rollingHash, lastHashed[0],
-                                              lastHashed[ldmParams.minMatchLength],
+                                              lastHashed[minMatchLength],
                                              hashPower);
        } else {
-            rollingHash = ZSTD_ldm_getRollingHash(ip, ldmParams.minMatchLength);
+            rollingHash = ZSTD_ldm_getRollingHash(ip, minMatchLength);
        }
        lastHashed = ip;

        /* Do not insert and do not look for a match */
-        if (ZSTD_ldm_getTag(rollingHash, hBits, ldmParams.hashEveryLog) !=
-                ldmTagMask) {
+        if (ZSTD_ldm_getTag(rollingHash, hBits, hashEveryLog) != ldmTagMask) {
           ip++;
           continue;
        }
@ -340,27 +344,49 @@ size_t ZSTD_compressBlock_ldm_generic(ZSTD_CCtx* cctx,
            ldmEntry_t* const bucket =
                ZSTD_ldm_getBucket(ldmState,
                                   ZSTD_ldm_getSmallHash(rollingHash, hBits),
-                                   ldmParams);
+                                   *params);
            ldmEntry_t* cur;
            size_t bestMatchLength = 0;
            U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits);

            for (cur = bucket; cur < bucket + ldmBucketSize; ++cur) {
-                const BYTE* const pMatch = cur->offset + base;
                size_t curForwardMatchLength, curBackwardMatchLength,
                       curTotalMatchLength;
                if (cur->checksum != checksum || cur->offset <= lowestIndex) {
                    continue;
                }
+                if (extDict) {
+                    BYTE const* const curMatchBase =
+                        cur->offset < dictLimit ? dictBase : base;
+                    BYTE const* const pMatch = curMatchBase + cur->offset;
+                    BYTE const* const matchEnd =
+                        cur->offset < dictLimit ? dictEnd : iend;
+                    BYTE const* const lowMatchPtr =
+                        cur->offset < dictLimit ? dictStart : lowPrefixPtr;

-                curForwardMatchLength = ZSTD_count(ip, pMatch, iend);
-                if (curForwardMatchLength < ldmParams.minMatchLength) {
-                    continue;
+                    curForwardMatchLength = ZSTD_count_2segments(
+                                                ip, pMatch, iend,
+                                                matchEnd, lowPrefixPtr);
+                    if (curForwardMatchLength < minMatchLength) {
+                        continue;
+                    }
+                    curBackwardMatchLength =
+                        ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch,
+                                                     lowMatchPtr);
+                    curTotalMatchLength = curForwardMatchLength +
+                                          curBackwardMatchLength;
+                } else { /* !extDict */
+                    BYTE const* const pMatch = base + cur->offset;
+                    curForwardMatchLength = ZSTD_count(ip, pMatch, iend);
+                    if (curForwardMatchLength < minMatchLength) {
+                        continue;
+                    }
+                    curBackwardMatchLength =
+                        ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch,
+                                                     lowPrefixPtr);
+                    curTotalMatchLength = curForwardMatchLength +
+                                          curBackwardMatchLength;
                }
-                curBackwardMatchLength = ZSTD_ldm_countBackwardsMatch(
-                                             ip, anchor, pMatch, lowest);
-                curTotalMatchLength = curForwardMatchLength +
-                                      curBackwardMatchLength;

                if (curTotalMatchLength > bestMatchLength) {
                    bestMatchLength = curTotalMatchLength;
@ -375,7 +401,7 @@ size_t ZSTD_compressBlock_ldm_generic(ZSTD_CCtx* cctx,
        if (bestEntry == NULL) {
            ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash,
                                             hBits, current,
-                                             ldmParams);
+                                             *params);
            ip++;
            continue;
        }
@ -384,324 +410,244 @@ size_t ZSTD_compressBlock_ldm_generic(ZSTD_CCtx* cctx,
        mLength = forwardMatchLength + backwardMatchLength;
        ip -= backwardMatchLength;

-        /* Call the block compressor on the remaining literals */
        {
+            /* Store the sequence:
+             * ip = current - backwardMatchLength
+             * The match is at (bestEntry->offset - backwardMatchLength)
+             */
            U32 const matchIndex = bestEntry->offset;
-            const BYTE* const match = base + matchIndex - backwardMatchLength;
-            U32 const offset = (U32)(ip - match);
+            U32 const offset = current - matchIndex;
+            rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size;

-            /* Overwrite rep codes */
-            for (i = 0; i < ZSTD_REP_NUM; i++)
-                seqStorePtr->rep[i] = repToConfirm[i];
-
-            /* Fill tables for block compressor */
-            ZSTD_ldm_limitTableUpdate(cctx, anchor);
-            ZSTD_ldm_fillFastTables(cctx, anchor);
-
-            /* Call block compressor and get remaining literals */
-            lastLiterals = blockCompressor(cctx, anchor, ip - anchor);
-            cctx->nextToUpdate = (U32)(ip - base);
-
-            /* Update repToConfirm with the new offset */
-            for (i = ZSTD_REP_NUM - 1; i > 0; i--)
-                repToConfirm[i] = repToConfirm[i-1];
-            repToConfirm[0] = offset;
-
-            /* Store the sequence with the leftover literals */
-            ZSTD_storeSeq(seqStorePtr, lastLiterals, ip - lastLiterals,
-                          offset + ZSTD_REP_MOVE, mLength - MINMATCH);
+            /* Out of sequence storage */
+            if (rawSeqStore->size == rawSeqStore->capacity)
+                return ERROR(dstSize_tooSmall);
+            seq->litLength = (U32)(ip - anchor);
+            seq->matchLength = (U32)mLength;
+            seq->offset = offset;
+            rawSeqStore->size++;
        }

        /* Insert the current entry into the hash table */
        ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, hBits,
                                         (U32)(lastHashed - base),
-                                         ldmParams);
+                                         *params);

        assert(ip + backwardMatchLength == lastHashed);

        /* Fill the hash table from lastHashed+1 to ip+mLength*/
        /* Heuristic: don't need to fill the entire table at end of block */
-        if (ip + mLength < ilimit) {
+        if (ip + mLength <= ilimit) {
            rollingHash = ZSTD_ldm_fillLdmHashTable(
                              ldmState, rollingHash, lastHashed,
-                              ip + mLength, base, hBits, ldmParams);
+                              ip + mLength, base, hBits, *params);
            lastHashed = ip + mLength - 1;
        }
        ip += mLength;
        anchor = ip;
-        /* Check immediate repcode */
-        while ( (ip < ilimit)
-             && ( (repToConfirm[1] > 0) && (repToConfirm[1] <= (U32)(ip-lowest))
-             && (MEM_read32(ip) == MEM_read32(ip - repToConfirm[1])) )) {
+    }
+    return iend - anchor;
+}

-            size_t const rLength = ZSTD_count(ip+4, ip+4-repToConfirm[1],
-                                              iend) + 4;
-            /* Swap repToConfirm[1] <=> repToConfirm[0] */
-            {
-                U32 const tmpOff = repToConfirm[1];
-                repToConfirm[1] = repToConfirm[0];
-                repToConfirm[0] = tmpOff;
-            }
+/*! ZSTD_ldm_reduceTable() :
+ *  reduce table indexes by `reducerValue` */
+static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size,
+                                 U32 const reducerValue)
+{
+    U32 u;
+    for (u = 0; u < size; u++) {
+        if (table[u].offset < reducerValue) table[u].offset = 0;
+        else table[u].offset -= reducerValue;
+    }
+}

-            ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength-MINMATCH);
+size_t ZSTD_ldm_generateSequences(
+        ldmState_t* ldmState, rawSeqStore_t* sequences,
+        ldmParams_t const* params, void const* src, size_t srcSize)
+{
+    U32 const maxDist = 1U << params->windowLog;
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    size_t const kMaxChunkSize = 1 << 20;
+    size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0);
+    size_t chunk;
+    size_t leftoverSize = 0;

-            /* Fill the  hash table from lastHashed+1 to ip+rLength*/
-            if (ip + rLength < ilimit) {
-                rollingHash = ZSTD_ldm_fillLdmHashTable(
-                                ldmState, rollingHash, lastHashed,
-                                ip + rLength, base, hBits, ldmParams);
-                lastHashed = ip + rLength - 1;
-            }
-            ip += rLength;
-            anchor = ip;
+    assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize);
+    /* Check that ZSTD_window_update() has been called for this chunk prior
+     * to passing it to this function.
+     */
+    assert(ldmState->window.nextSrc >= (BYTE const*)src + srcSize);
+    /* The input could be very large (in zstdmt), so it must be broken up into
+     * chunks to enforce the maximmum distance and handle overflow correction.
+     */
+    assert(sequences->pos <= sequences->size);
+    assert(sequences->size <= sequences->capacity);
+    for (chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) {
+        BYTE const* const chunkStart = istart + chunk * kMaxChunkSize;
+        size_t const remaining = (size_t)(iend - chunkStart);
+        BYTE const *const chunkEnd =
+            (remaining < kMaxChunkSize) ? iend : chunkStart + kMaxChunkSize;
+        size_t const chunkSize = chunkEnd - chunkStart;
+        size_t newLeftoverSize;
+        size_t const prevSize = sequences->size;
+
+        assert(chunkStart < iend);
+        /* 1. Perform overflow correction if necessary. */
+        if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)) {
+            U32 const ldmHSize = 1U << params->hashLog;
+            U32 const correction = ZSTD_window_correctOverflow(
+                &ldmState->window, /* cycleLog */ 0, maxDist, src);
+            ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction);
        }
-    }
-
-    /* Overwrite rep */
-    for (i = 0; i < ZSTD_REP_NUM; i++)
-        seqStorePtr->rep[i] = repToConfirm[i];
-
-    ZSTD_ldm_limitTableUpdate(cctx, anchor);
-    ZSTD_ldm_fillFastTables(cctx, anchor);
-
-    lastLiterals = blockCompressor(cctx, anchor, iend - anchor);
-    cctx->nextToUpdate = (U32)(iend - base);
-
-    /* Restore seqStorePtr->rep */
-    for (i = 0; i < ZSTD_REP_NUM; i++)
-        seqStorePtr->rep[i] = savedRep[i];
-
-    /* Return the last literals size */
-    return lastLiterals;
-}
-
-size_t ZSTD_compressBlock_ldm(ZSTD_CCtx* ctx,
-                              const void* src, size_t srcSize)
-{
-    return ZSTD_compressBlock_ldm_generic(ctx, src, srcSize);
-}
-
-static size_t ZSTD_compressBlock_ldm_extDict_generic(
-                                 ZSTD_CCtx* ctx,
-                                 const void* src, size_t srcSize)
-{
-    ldmState_t* const ldmState = &(ctx->ldmState);
-    const ldmParams_t ldmParams = ctx->appliedParams.ldmParams;
-    const U64 hashPower = ldmState->hashPower;
-    const U32 hBits = ldmParams.hashLog - ldmParams.bucketSizeLog;
-    const U32 ldmBucketSize = ((U32)1 << ldmParams.bucketSizeLog);
-    const U32 ldmTagMask = ((U32)1 << ldmParams.hashEveryLog) - 1;
-    seqStore_t* const seqStorePtr = &(ctx->seqStore);
-    const BYTE* const base = ctx->base;
-    const BYTE* const dictBase = ctx->dictBase;
-    const BYTE* const istart = (const BYTE*)src;
-    const BYTE* ip = istart;
-    const BYTE* anchor = istart;
-    const U32   lowestIndex = ctx->lowLimit;
-    const BYTE* const dictStart = dictBase + lowestIndex;
-    const U32   dictLimit = ctx->dictLimit;
-    const BYTE* const lowPrefixPtr = base + dictLimit;
-    const BYTE* const dictEnd = dictBase + dictLimit;
-    const BYTE* const iend = istart + srcSize;
-    const BYTE* const ilimit = iend - MAX(ldmParams.minMatchLength, HASH_READ_SIZE);
-
-    const ZSTD_blockCompressor blockCompressor =
-        ZSTD_selectBlockCompressor(ctx->appliedParams.cParams.strategy, 1);
-    U32* const repToConfirm = seqStorePtr->repToConfirm;
-    U32 savedRep[ZSTD_REP_NUM];
-    U64 rollingHash = 0;
-    const BYTE* lastHashed = NULL;
-    size_t i, lastLiterals;
-
-    /* Save seqStorePtr->rep and copy repToConfirm */
-    for (i = 0; i < ZSTD_REP_NUM; i++) {
-        savedRep[i] = repToConfirm[i] = seqStorePtr->rep[i];
-    }
-
-    /* Search Loop */
-    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
-        size_t mLength;
-        const U32 current = (U32)(ip-base);
-        size_t forwardMatchLength = 0, backwardMatchLength = 0;
-        ldmEntry_t* bestEntry = NULL;
-        if (ip != istart) {
-          rollingHash = ZSTD_ldm_updateHash(rollingHash, lastHashed[0],
-                                       lastHashed[ldmParams.minMatchLength],
-                                       hashPower);
+        /* 2. We enforce the maximum offset allowed.
+         *
+         * kMaxChunkSize should be small enough that we don't lose too much of
+         * the window through early invalidation.
+         * TODO: * Test the chunk size.
+         *       * Try invalidation after the sequence generation and test the
+         *         the offset against maxDist directly.
+         */
+        ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, NULL);
+        /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */
+        newLeftoverSize = ZSTD_ldm_generateSequences_internal(
+            ldmState, sequences, params, chunkStart, chunkSize);
+        if (ZSTD_isError(newLeftoverSize))
+            return newLeftoverSize;
+        /* 4. We add the leftover literals from previous iterations to the first
+         *    newly generated sequence, or add the `newLeftoverSize` if none are
+         *    generated.
+         */
+        /* Prepend the leftover literals from the last call */
+        if (prevSize < sequences->size) {
+            sequences->seq[prevSize].litLength += (U32)leftoverSize;
+            leftoverSize = newLeftoverSize;
        } else {
-            rollingHash = ZSTD_ldm_getRollingHash(ip, ldmParams.minMatchLength);
-        }
-        lastHashed = ip;
-
-        if (ZSTD_ldm_getTag(rollingHash, hBits, ldmParams.hashEveryLog) !=
-                ldmTagMask) {
-            /* Don't insert and don't look for a match */
-           ip++;
-           continue;
-        }
-
-        /* Get the best entry and compute the match lengths */
-        {
-            ldmEntry_t* const bucket =
-                ZSTD_ldm_getBucket(ldmState,
-                                   ZSTD_ldm_getSmallHash(rollingHash, hBits),
-                                   ldmParams);
-            ldmEntry_t* cur;
-            size_t bestMatchLength = 0;
-            U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits);
-
-            for (cur = bucket; cur < bucket + ldmBucketSize; ++cur) {
-                const BYTE* const curMatchBase =
-                    cur->offset < dictLimit ? dictBase : base;
-                const BYTE* const pMatch = curMatchBase + cur->offset;
-                const BYTE* const matchEnd =
-                    cur->offset < dictLimit ? dictEnd : iend;
-                const BYTE* const lowMatchPtr =
-                    cur->offset < dictLimit ? dictStart : lowPrefixPtr;
-                size_t curForwardMatchLength, curBackwardMatchLength,
-                       curTotalMatchLength;
-
-                if (cur->checksum != checksum || cur->offset <= lowestIndex) {
-                    continue;
-                }
-
-                curForwardMatchLength = ZSTD_count_2segments(
-                                            ip, pMatch, iend,
-                                            matchEnd, lowPrefixPtr);
-                if (curForwardMatchLength < ldmParams.minMatchLength) {
-                    continue;
-                }
-                curBackwardMatchLength = ZSTD_ldm_countBackwardsMatch(
-                                             ip, anchor, pMatch, lowMatchPtr);
-                curTotalMatchLength = curForwardMatchLength +
-                                      curBackwardMatchLength;
-
-                if (curTotalMatchLength > bestMatchLength) {
-                    bestMatchLength = curTotalMatchLength;
-                    forwardMatchLength = curForwardMatchLength;
-                    backwardMatchLength = curBackwardMatchLength;
-                    bestEntry = cur;
-                }
-            }
-        }
-
-        /* No match found -- continue searching */
-        if (bestEntry == NULL) {
-            ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, hBits,
-                                             (U32)(lastHashed - base),
-                                             ldmParams);
-            ip++;
-            continue;
-        }
-
-        /* Match found */
-        mLength = forwardMatchLength + backwardMatchLength;
-        ip -= backwardMatchLength;
-
-        /* Call the block compressor on the remaining literals */
-        {
-            /* ip = current - backwardMatchLength
-             * The match is at (bestEntry->offset - backwardMatchLength) */
-            U32 const matchIndex = bestEntry->offset;
-            U32 const offset = current - matchIndex;
-
-            /* Overwrite rep codes */
-            for (i = 0; i < ZSTD_REP_NUM; i++)
-                seqStorePtr->rep[i] = repToConfirm[i];
-
-            /* Fill the hash table for the block compressor */
-            ZSTD_ldm_limitTableUpdate(ctx, anchor);
-            ZSTD_ldm_fillFastTables(ctx, anchor);
-
-            /* Call block compressor and get remaining literals  */
-            lastLiterals = blockCompressor(ctx, anchor, ip - anchor);
-            ctx->nextToUpdate = (U32)(ip - base);
-
-            /* Update repToConfirm with the new offset */
-            for (i = ZSTD_REP_NUM - 1; i > 0; i--)
-                repToConfirm[i] = repToConfirm[i-1];
-            repToConfirm[0] = offset;
-
-            /* Store the sequence with the leftover literals */
-            ZSTD_storeSeq(seqStorePtr, lastLiterals, ip - lastLiterals,
-                          offset + ZSTD_REP_MOVE, mLength - MINMATCH);
-        }
-
-        /* Insert the current entry into the hash table */
-        ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, hBits,
-                                         (U32)(lastHashed - base),
-                                         ldmParams);
-
-        /* Fill the hash table from lastHashed+1 to ip+mLength */
-        assert(ip + backwardMatchLength == lastHashed);
-        if (ip + mLength < ilimit) {
-            rollingHash = ZSTD_ldm_fillLdmHashTable(
-                              ldmState, rollingHash, lastHashed,
-                              ip + mLength, base, hBits,
-                              ldmParams);
-            lastHashed = ip + mLength - 1;
-        }
-        ip += mLength;
-        anchor = ip;
-
-        /* check immediate repcode */
-        while (ip < ilimit) {
-            U32 const current2 = (U32)(ip-base);
-            U32 const repIndex2 = current2 - repToConfirm[1];
-            const BYTE* repMatch2 = repIndex2 < dictLimit ?
-                                    dictBase + repIndex2 : base + repIndex2;
-            if ( (((U32)((dictLimit-1) - repIndex2) >= 3) &
-                        (repIndex2 > lowestIndex))  /* intentional overflow */
-               && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
-                const BYTE* const repEnd2 = repIndex2 < dictLimit ?
-                                            dictEnd : iend;
-                size_t const repLength2 =
-                        ZSTD_count_2segments(ip+4, repMatch2+4, iend,
-                                             repEnd2, lowPrefixPtr) + 4;
-
-                U32 tmpOffset = repToConfirm[1];
-                repToConfirm[1] = repToConfirm[0];
-                repToConfirm[0] = tmpOffset;
-
-                ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2-MINMATCH);
-
-                /* Fill the  hash table from lastHashed+1 to ip+repLength2*/
-                if (ip + repLength2 < ilimit) {
-                    rollingHash = ZSTD_ldm_fillLdmHashTable(
-                                      ldmState, rollingHash, lastHashed,
-                                      ip + repLength2, base, hBits,
-                                      ldmParams);
-                    lastHashed = ip + repLength2 - 1;
-                }
-                ip += repLength2;
-                anchor = ip;
-                continue;
-            }
-            break;
+            assert(newLeftoverSize == chunkSize);
+            leftoverSize += chunkSize;
        }
    }
-
-    /* Overwrite rep */
-    for (i = 0; i < ZSTD_REP_NUM; i++)
-        seqStorePtr->rep[i] = repToConfirm[i];
-
-    ZSTD_ldm_limitTableUpdate(ctx, anchor);
-    ZSTD_ldm_fillFastTables(ctx, anchor);
-
-    /* Call the block compressor one last time on the last literals */
-    lastLiterals = blockCompressor(ctx, anchor, iend - anchor);
-    ctx->nextToUpdate = (U32)(iend - base);
-
-    /* Restore seqStorePtr->rep */
-    for (i = 0; i < ZSTD_REP_NUM; i++)
-        seqStorePtr->rep[i] = savedRep[i];
-
-    /* Return the last literals size */
-    return lastLiterals;
+    return 0;
 }

-size_t ZSTD_compressBlock_ldm_extDict(ZSTD_CCtx* ctx,
-                                      const void* src, size_t srcSize)
+void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) {
+    while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) {
+        rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos;
+        if (srcSize <= seq->litLength) {
+            /* Skip past srcSize literals */
+            seq->litLength -= (U32)srcSize;
+            return;
+        }
+        srcSize -= seq->litLength;
+        seq->litLength = 0;
+        if (srcSize < seq->matchLength) {
+            /* Skip past the first srcSize of the match */
+            seq->matchLength -= (U32)srcSize;
+            if (seq->matchLength < minMatch) {
+                /* The match is too short, omit it */
+                if (rawSeqStore->pos + 1 < rawSeqStore->size) {
+                    seq[1].litLength += seq[0].matchLength;
+                }
+                rawSeqStore->pos++;
+            }
+            return;
+        }
+        srcSize -= seq->matchLength;
+        seq->matchLength = 0;
+        rawSeqStore->pos++;
+    }
+}
+
+/**
+ * If the sequence length is longer than remaining then the sequence is split
+ * between this block and the next.
+ *
+ * Returns the current sequence to handle, or if the rest of the block should
+ * be literals, it returns a sequence with offset == 0.
+ */
+static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore,
+                                 U32 const remaining, U32 const minMatch)
 {
-    return ZSTD_compressBlock_ldm_extDict_generic(ctx, src, srcSize);
+    rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos];
+    assert(sequence.offset > 0);
+    /* Likely: No partial sequence */
+    if (remaining >= sequence.litLength + sequence.matchLength) {
+        rawSeqStore->pos++;
+        return sequence;
+    }
+    /* Cut the sequence short (offset == 0 ==> rest is literals). */
+    if (remaining <= sequence.litLength) {
+        sequence.offset = 0;
+    } else if (remaining < sequence.litLength + sequence.matchLength) {
+        sequence.matchLength = remaining - sequence.litLength;
+        if (sequence.matchLength < minMatch) {
+            sequence.offset = 0;
+        }
+    }
+    /* Skip past `remaining` bytes for the future sequences. */
+    ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch);
+    return sequence;
+}
+
+size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+    ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize,
+    int const extDict)
+{
+    unsigned const minMatch = cParams->searchLength;
+    ZSTD_blockCompressor const blockCompressor =
+        ZSTD_selectBlockCompressor(cParams->strategy, extDict);
+    BYTE const* const base = ms->window.base;
+    /* Input bounds */
+    BYTE const* const istart = (BYTE const*)src;
+    BYTE const* const iend = istart + srcSize;
+    /* Input positions */
+    BYTE const* ip = istart;
+
+    assert(rawSeqStore->pos <= rawSeqStore->size);
+    assert(rawSeqStore->size <= rawSeqStore->capacity);
+    /* Loop through each sequence and apply the block compressor to the lits */
+    while (rawSeqStore->pos < rawSeqStore->size && ip < iend) {
+        /* maybeSplitSequence updates rawSeqStore->pos */
+        rawSeq const sequence = maybeSplitSequence(rawSeqStore,
+                                                   (U32)(iend - ip), minMatch);
+        int i;
+        /* End signal */
+        if (sequence.offset == 0)
+            break;
+
+        assert(sequence.offset <= (1U << cParams->windowLog));
+        assert(ip + sequence.litLength + sequence.matchLength <= iend);
+
+        /* Fill tables for block compressor */
+        ZSTD_ldm_limitTableUpdate(ms, ip);
+        ZSTD_ldm_fillFastTables(ms, cParams, ip);
+        /* Run the block compressor */
+        {
+            size_t const newLitLength =
+                blockCompressor(ms, seqStore, rep, cParams, ip,
+                                sequence.litLength);
+            ip += sequence.litLength;
+            ms->nextToUpdate = (U32)(ip - base);
+            /* Update the repcodes */
+            for (i = ZSTD_REP_NUM - 1; i > 0; i--)
+                rep[i] = rep[i-1];
+            rep[0] = sequence.offset;
+            /* Store the sequence */
+            ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength,
+                          sequence.offset + ZSTD_REP_MOVE,
+                          sequence.matchLength - MINMATCH);
+            ip += sequence.matchLength;
+        }
+    }
+    /* Fill the tables for the block compressor */
+    ZSTD_ldm_limitTableUpdate(ms, ip);
+    ZSTD_ldm_fillFastTables(ms, cParams, ip);
+    /* Compress the last literals */
+    {
+        size_t const lastLiterals = blockCompressor(ms, seqStore, rep, cParams,
+                                                    ip, iend - ip);
+        ms->nextToUpdate = (U32)(iend - base);
+        return lastLiterals;
+    }
 }
--- a/sys/contrib/zstd/lib/compress/zstd_ldm.h
+++ b/sys/contrib/zstd/lib/compress/zstd_ldm.h
@ -22,32 +22,71 @@ extern "C" {
 ***************************************/

 #define ZSTD_LDM_DEFAULT_WINDOW_LOG ZSTD_WINDOWLOG_DEFAULTMAX
-#define ZSTD_LDM_HASHEVERYLOG_NOTSET 9999

-/** ZSTD_compressBlock_ldm_generic() :
+/**
+ * ZSTD_ldm_generateSequences():
 *
- *  This is a block compressor intended for long distance matching.
+ * Generates the sequences using the long distance match finder.
+ * Generates long range matching sequences in `sequences`, which parse a prefix
+ * of the source. `sequences` must be large enough to store every sequence,
+ * which can be checked with `ZSTD_ldm_getMaxNbSeq()`.
+ * @returns 0 or an error code.
 *
- *  The function searches for matches of length at least
- *  ldmParams.minMatchLength using a hash table in cctx->ldmState.
- *  Matches can be at a distance of up to cParams.windowLog.
- *
- *  Upon finding a match, the unmatched literals are compressed using a
- *  ZSTD_blockCompressor (depending on the strategy in the compression
- *  parameters), which stores the matched sequences. The "long distance"
- *  match is then stored with the remaining literals from the
- *  ZSTD_blockCompressor. */
-size_t ZSTD_compressBlock_ldm(ZSTD_CCtx* cctx, const void* src, size_t srcSize);
-size_t ZSTD_compressBlock_ldm_extDict(ZSTD_CCtx* ctx,
-                                      const void* src, size_t srcSize);
+ * NOTE: The user must have called ZSTD_window_update() for all of the input
+ * they have, even if they pass it to ZSTD_ldm_generateSequences() in chunks.
+ * NOTE: This function returns an error if it runs out of space to store
+ *       sequences.
+ */
+size_t ZSTD_ldm_generateSequences(
+            ldmState_t* ldms, rawSeqStore_t* sequences,
+            ldmParams_t const* params, void const* src, size_t srcSize);
+
+/**
+ * ZSTD_ldm_blockCompress():
+ *
+ * Compresses a block using the predefined sequences, along with a secondary
+ * block compressor. The literals section of every sequence is passed to the
+ * secondary block compressor, and those sequences are interspersed with the
+ * predefined sequences. Returns the length of the last literals.
+ * Updates `rawSeqStore.pos` to indicate how many sequences have been consumed.
+ * `rawSeqStore.seq` may also be updated to split the last sequence between two
+ * blocks.
+ * @return The length of the last literals.
+ *
+ * NOTE: The source must be at most the maximum block size, but the predefined
+ * sequences can be any size, and may be longer than the block. In the case that
+ * they are longer than the block, the last sequences may need to be split into
+ * two. We handle that case correctly, and update `rawSeqStore` appropriately.
+ * NOTE: This function does not return any errors.
+ */
+size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+            ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+            ZSTD_compressionParameters const* cParams,
+            void const* src, size_t srcSize,
+            int const extDict);
+
+/**
+ * ZSTD_ldm_skipSequences():
+ *
+ * Skip past `srcSize` bytes worth of sequences in `rawSeqStore`.
+ * Avoids emitting matches less than `minMatch` bytes.
+ * Must be called for data with is not passed to ZSTD_ldm_blockCompress().
+ */
+void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize,
+    U32 const minMatch);

-/** ZSTD_ldm_initializeParameters() :
- *  Initialize the long distance matching parameters to their default values. */
-size_t ZSTD_ldm_initializeParameters(ldmParams_t* params, U32 enableLdm);

 /** ZSTD_ldm_getTableSize() :
- *  Estimate the space needed for long distance matching tables. */
-size_t ZSTD_ldm_getTableSize(U32 hashLog, U32 bucketSizeLog);
+ *  Estimate the space needed for long distance matching tables or 0 if LDM is
+ *  disabled.
+ */
+size_t ZSTD_ldm_getTableSize(ldmParams_t params);
+
+/** ZSTD_ldm_getSeqSpace() :
+ *  Return an upper bound on the number of sequences that can be produced by
+ *  the long distance matcher, or 0 if LDM is disabled.
+ */
+size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize);

 /** ZSTD_ldm_getTableSize() :
 *  Return prime8bytes^(minMatchLength-1) */
@ -58,8 +97,12 @@ U64 ZSTD_ldm_getHashPower(U32 minMatchLength);
 *  windowLog and params->hashLog.
 *
 *  Ensures that params->bucketSizeLog is <= params->hashLog (setting it to
- *  params->hashLog if it is not). */
-void ZSTD_ldm_adjustParameters(ldmParams_t* params, U32 windowLog);
+ *  params->hashLog if it is not).
+ *
+ *  Ensures that the minMatchLength >= targetLength during optimal parsing.
+ */
+void ZSTD_ldm_adjustParameters(ldmParams_t* params,
+                               ZSTD_compressionParameters const* cParams);

 #if defined (__cplusplus)
 }
--- a/sys/contrib/zstd/lib/compress/zstd_opt.c
+++ b/sys/contrib/zstd/lib/compress/zstd_opt.c
@ -10,7 +10,6 @@

 #include "zstd_compress_internal.h"
 #include "zstd_opt.h"
-#include "zstd_lazy.h"   /* ZSTD_updateTree, ZSTD_updateTree_extDict */


 #define ZSTD_LITFREQ_ADD    2   /* scaling factor for litFreq, so that frequencies adapt faster to new stats. Also used for matchSum (?) */
@ -244,14 +243,15 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length)

 /* Update hashTable3 up to ip (excluded)
   Assumption : always within prefix (i.e. not within extDict) */
-static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_CCtx* const cctx, const BYTE* const ip)
+static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, const BYTE* const ip)
 {
-    U32* const hashTable3  = cctx->hashTable3;
-    U32 const hashLog3  = cctx->hashLog3;
-    const BYTE* const base = cctx->base;
-    U32 idx = cctx->nextToUpdate3;
-    U32 const target = cctx->nextToUpdate3 = (U32)(ip - base);
+    U32* const hashTable3 = ms->hashTable3;
+    U32 const hashLog3 = ms->hashLog3;
+    const BYTE* const base = ms->window.base;
+    U32 idx = ms->nextToUpdate3;
+    U32 const target = ms->nextToUpdate3 = (U32)(ip - base);
    size_t const hash3 = ZSTD_hash3Ptr(ip, hashLog3);
+    assert(hashLog3 > 0);

    while(idx < target) {
        hashTable3[ZSTD_hash3Ptr(base+idx, hashLog3)] = idx;
@ -265,36 +265,173 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_CCtx* const cctx, const BYTE*
 /*-*************************************
 *  Binary Tree search
 ***************************************/
+/** ZSTD_insertBt1() : add one or multiple positions to tree.
+ *  ip : assumed <= iend-8 .
+ * @return : nb of positions added */
+static U32 ZSTD_insertBt1(
+                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                const BYTE* const ip, const BYTE* const iend,
+                U32 const mls, U32 const extDict)
+{
+    U32*   const hashTable = ms->hashTable;
+    U32    const hashLog = cParams->hashLog;
+    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32*   const bt = ms->chainTable;
+    U32    const btLog  = cParams->chainLog - 1;
+    U32    const btMask = (1 << btLog) - 1;
+    U32 matchIndex = hashTable[h];
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* match;
+    const U32 current = (U32)(ip-base);
+    const U32 btLow = btMask >= current ? 0 : current - btMask;
+    U32* smallerPtr = bt + 2*(current&btMask);
+    U32* largerPtr  = smallerPtr + 1;
+    U32 dummy32;   /* to be nullified at the end */
+    U32 const windowLow = ms->window.lowLimit;
+    U32 matchEndIdx = current+8+1;
+    size_t bestLength = 8;
+    U32 nbCompares = 1U << cParams->searchLog;
+#ifdef ZSTD_C_PREDICT
+    U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0);
+    U32 predictedLarge = *(bt + 2*((current-1)&btMask) + 1);
+    predictedSmall += (predictedSmall>0);
+    predictedLarge += (predictedLarge>0);
+#endif /* ZSTD_C_PREDICT */
+
+    DEBUGLOG(8, "ZSTD_insertBt1 (%u)", current);
+
+    assert(ip <= iend-8);   /* required for h calculation */
+    hashTable[h] = current;   /* Update Hash Table */
+
+    while (nbCompares-- && (matchIndex > windowLow)) {
+        U32* const nextPtr = bt + 2*(matchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        assert(matchIndex < current);
+
+#ifdef ZSTD_C_PREDICT   /* note : can create issues when hlog small <= 11 */
+        const U32* predictPtr = bt + 2*((matchIndex-1) & btMask);   /* written this way, as bt is a roll buffer */
+        if (matchIndex == predictedSmall) {
+            /* no need to check length, result known */
+            *smallerPtr = matchIndex;
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
+            matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+            predictedSmall = predictPtr[1] + (predictPtr[1]>0);
+            continue;
+        }
+        if (matchIndex == predictedLarge) {
+            *largerPtr = matchIndex;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+            predictedLarge = predictPtr[0] + (predictPtr[0]>0);
+            continue;
+        }
+#endif
+
+        if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
+            assert(matchIndex+matchLength >= dictLimit);   /* might be wrong if extDict is incorrectly set to 0 */
+            match = base + matchIndex;
+            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+        } else {
+            match = dictBase + matchIndex;
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+            if (matchIndex+matchLength >= dictLimit)
+                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+        }
+
+        if (matchLength > bestLength) {
+            bestLength = matchLength;
+            if (matchLength > matchEndIdx - matchIndex)
+                matchEndIdx = matchIndex + (U32)matchLength;
+        }
+
+        if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
+            break;   /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
+        }
+
+        if (match[matchLength] < ip[matchLength]) {  /* necessarily within buffer */
+            /* match is smaller than current */
+            *smallerPtr = matchIndex;             /* update smaller idx */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            smallerPtr = nextPtr+1;               /* new "candidate" => larger than match, which was smaller than target */
+            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous and closer to current */
+        } else {
+            /* match is larger than current */
+            *largerPtr = matchIndex;
+            commonLengthLarger = matchLength;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+    }   }
+
+    *smallerPtr = *largerPtr = 0;
+    if (bestLength > 384) return MIN(192, (U32)(bestLength - 384));   /* speed optimization */
+    assert(matchEndIdx > current + 8);
+    return matchEndIdx - (current + 8);
+}
+
+FORCE_INLINE_TEMPLATE
+void ZSTD_updateTree_internal(
+                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                const BYTE* const ip, const BYTE* const iend,
+                const U32 mls, const U32 extDict)
+{
+    const BYTE* const base = ms->window.base;
+    U32 const target = (U32)(ip - base);
+    U32 idx = ms->nextToUpdate;
+    DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u  (extDict:%u)",
+                idx, target, extDict);
+
+    while(idx < target)
+        idx += ZSTD_insertBt1(ms, cParams, base+idx, iend, mls, extDict);
+    ms->nextToUpdate = target;
+}
+
+void ZSTD_updateTree(
+                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+                const BYTE* ip, const BYTE* iend)
+{
+    ZSTD_updateTree_internal(ms, cParams, ip, iend, cParams->searchLength, 0 /*extDict*/);
+}
+
 FORCE_INLINE_TEMPLATE
 U32 ZSTD_insertBtAndGetAllMatches (
-                    ZSTD_CCtx* zc,
+                    ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
                    const BYTE* const ip, const BYTE* const iLimit, int const extDict,
-                    U32 nbCompares, U32 const mls, U32 const sufficient_len,
                    U32 rep[ZSTD_REP_NUM], U32 const ll0,
-                    ZSTD_match_t* matches, const U32 lengthToBeat)
+                    ZSTD_match_t* matches, const U32 lengthToBeat, U32 const mls /* template */)
 {
-    const BYTE* const base = zc->base;
+    U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
+    const BYTE* const base = ms->window.base;
    U32 const current = (U32)(ip-base);
-    U32 const hashLog = zc->appliedParams.cParams.hashLog;
+    U32 const hashLog = cParams->hashLog;
    U32 const minMatch = (mls==3) ? 3 : 4;
-    U32* const hashTable = zc->hashTable;
+    U32* const hashTable = ms->hashTable;
    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
    U32 matchIndex  = hashTable[h];
-    U32* const bt   = zc->chainTable;
-    U32 const btLog = zc->appliedParams.cParams.chainLog - 1;
+    U32* const bt   = ms->chainTable;
+    U32 const btLog = cParams->chainLog - 1;
    U32 const btMask= (1U << btLog) - 1;
    size_t commonLengthSmaller=0, commonLengthLarger=0;
-    const BYTE* const dictBase = zc->dictBase;
-    U32 const dictLimit = zc->dictLimit;
+    const BYTE* const dictBase = ms->window.dictBase;
+    U32 const dictLimit = ms->window.dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    U32 const btLow = btMask >= current ? 0 : current - btMask;
-    U32 const windowLow = zc->lowLimit;
+    U32 const windowLow = ms->window.lowLimit;
    U32* smallerPtr = bt + 2*(current&btMask);
    U32* largerPtr  = bt + 2*(current&btMask) + 1;
    U32 matchEndIdx = current+8+1;   /* farthest referenced position of any match => detects repetitive patterns */
    U32 dummy32;   /* to be nullified at the end */
    U32 mnum = 0;
+    U32 nbCompares = 1U << cParams->searchLog;

    size_t bestLength = lengthToBeat-1;
    DEBUGLOG(7, "ZSTD_insertBtAndGetAllMatches");
@ -335,7 +472,7 @@ U32 ZSTD_insertBtAndGetAllMatches (

    /* HC3 match finder */
    if ((mls == 3) /*static*/ && (bestLength < mls)) {
-        U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3 (zc, ip);
+        U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(ms, ip);
        if ((matchIndex3 > windowLow)
          & (current - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) {
            size_t mlen;
@ -359,7 +496,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
                mnum = 1;
                if ( (mlen > sufficient_len) |
                     (ip+mlen == iLimit) ) {  /* best possible length */
-                    zc->nextToUpdate = current+1;  /* skip insertion */
+                    ms->nextToUpdate = current+1;  /* skip insertion */
                    return 1;
    }   }   }   }

@ -416,30 +553,29 @@ U32 ZSTD_insertBtAndGetAllMatches (
    *smallerPtr = *largerPtr = 0;

    assert(matchEndIdx > current+8);
-    zc->nextToUpdate = matchEndIdx - 8;  /* skip repetitive patterns */
+    ms->nextToUpdate = matchEndIdx - 8;  /* skip repetitive patterns */
    return mnum;
 }


 FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
-                        ZSTD_CCtx* zc,   /* Index table will be updated */
+                        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
                        const BYTE* ip, const BYTE* const iHighLimit, int const extDict,
-                        U32 const maxNbAttempts, U32 const matchLengthSearch, U32 const sufficient_len,
                        U32 rep[ZSTD_REP_NUM], U32 const ll0,
                        ZSTD_match_t* matches, U32 const lengthToBeat)
 {
+    U32 const matchLengthSearch = cParams->searchLength;
    DEBUGLOG(7, "ZSTD_BtGetAllMatches");
-    if (ip < zc->base + zc->nextToUpdate) return 0;   /* skipped area */
-    if (extDict) ZSTD_updateTree_extDict(zc, ip, iHighLimit, maxNbAttempts, matchLengthSearch);
-    else ZSTD_updateTree(zc, ip, iHighLimit, maxNbAttempts, matchLengthSearch);
+    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
+    ZSTD_updateTree_internal(ms, cParams, ip, iHighLimit, matchLengthSearch, extDict);
    switch(matchLengthSearch)
    {
-    case 3 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 3, sufficient_len, rep, ll0, matches, lengthToBeat);
+    case 3 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 3);
    default :
-    case 4 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 4, sufficient_len, rep, ll0, matches, lengthToBeat);
-    case 5 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 5, sufficient_len, rep, ll0, matches, lengthToBeat);
+    case 4 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 4);
+    case 5 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 5);
    case 7 :
-    case 6 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 6, sufficient_len, rep, ll0, matches, lengthToBeat);
+    case 6 : return ZSTD_insertBtAndGetAllMatches(ms, cParams, ip, iHighLimit, extDict, rep, ll0, matches, lengthToBeat, 6);
    }
 }

@ -527,36 +663,33 @@ static int ZSTD_literalsContribution_cached(
 }

 FORCE_INLINE_TEMPLATE
-size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
+size_t ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,seqStore_t* seqStore,
+                                      U32 rep[ZSTD_REP_NUM],
+                                      ZSTD_compressionParameters const* cParams,
                                      const void* src, size_t srcSize,
                                      const int optLevel, const int extDict)
 {
-    seqStore_t* const seqStorePtr = &(ctx->seqStore);
-    optState_t* const optStatePtr = &(ctx->optState);
+    optState_t* const optStatePtr = &ms->opt;
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - 8;
-    const BYTE* const base = ctx->base;
-    const BYTE* const prefixStart = base + ctx->dictLimit;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const prefixStart = base + ms->window.dictLimit;

-    U32 const maxSearches = 1U << ctx->appliedParams.cParams.searchLog;
-    U32 const sufficient_len = MIN(ctx->appliedParams.cParams.targetLength, ZSTD_OPT_NUM -1);
-    U32 const mls = ctx->appliedParams.cParams.searchLength;
-    U32 const minMatch = (ctx->appliedParams.cParams.searchLength == 3) ? 3 : 4;
+    U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
+    U32 const minMatch = (cParams->searchLength == 3) ? 3 : 4;

    ZSTD_optimal_t* const opt = optStatePtr->priceTable;
    ZSTD_match_t* const matches = optStatePtr->matchTable;
    cachedLiteralPrice_t cachedLitPrice;
-    U32 rep[ZSTD_REP_NUM];

    /* init */
    DEBUGLOG(5, "ZSTD_compressBlock_opt_generic");
-    ctx->nextToUpdate3 = ctx->nextToUpdate;
+    ms->nextToUpdate3 = ms->nextToUpdate;
    ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize);
    ip += (ip==prefixStart);
-    { int i; for (i=0; i<ZSTD_REP_NUM; i++) rep[i]=seqStorePtr->rep[i]; }
    memset(&cachedLitPrice, 0, sizeof(cachedLitPrice));

    /* Match Loop */
@ -567,7 +700,7 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
        /* find first match */
        {   U32 const litlen = (U32)(ip - anchor);
            U32 const ll0 = !litlen;
-            U32 const nbMatches = ZSTD_BtGetAllMatches(ctx, ip, iend, extDict, maxSearches, mls, sufficient_len, rep, ll0, matches, minMatch);
+            U32 const nbMatches = ZSTD_BtGetAllMatches(ms, cParams, ip, iend, extDict, rep, ll0, matches, minMatch);
            if (!nbMatches) { ip++; continue; }

            /* initialize opt[0] */
@ -653,7 +786,7 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
                U32 const litlen = (opt[cur].mlen == 1) ? opt[cur].litlen : 0;
                U32 const previousPrice = (cur > litlen) ? opt[cur-litlen].price : 0;
                U32 const basePrice = previousPrice + ZSTD_fullLiteralsCost(inr-litlen, litlen, optStatePtr);
-                U32 const nbMatches = ZSTD_BtGetAllMatches(ctx, inr, iend, extDict, maxSearches, mls, sufficient_len, opt[cur].rep, ll0, matches, minMatch);
+                U32 const nbMatches = ZSTD_BtGetAllMatches(ms, cParams, inr, iend, extDict, opt[cur].rep, ll0, matches, minMatch);
                U32 matchNb;
                if (!nbMatches) continue;

@ -749,37 +882,42 @@ _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
                }

                ZSTD_updateStats(optStatePtr, llen, anchor, offset, mlen);
-                ZSTD_storeSeq(seqStorePtr, llen, anchor, offset, mlen-MINMATCH);
+                ZSTD_storeSeq(seqStore, llen, anchor, offset, mlen-MINMATCH);
                anchor = ip;
        }   }
        ZSTD_setLog2Prices(optStatePtr);
    }   /* while (ip < ilimit) */

-    /* Save reps for next block */
-    { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqStorePtr->repToConfirm[i] = rep[i]; }
-
    /* Return the last literals size */
    return iend - anchor;
 }


-size_t ZSTD_compressBlock_btopt(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_btopt(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
 {
    DEBUGLOG(5, "ZSTD_compressBlock_btopt");
-    return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0 /*optLevel*/, 0 /*extDict*/);
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, 0 /*extDict*/);
 }

-size_t ZSTD_compressBlock_btultra(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_btultra(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 2 /*optLevel*/, 0 /*extDict*/);
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, 0 /*extDict*/);
 }

-size_t ZSTD_compressBlock_btopt_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_btopt_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0 /*optLevel*/, 1 /*extDict*/);
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 0 /*optLevel*/, 1 /*extDict*/);
 }

-size_t ZSTD_compressBlock_btultra_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+size_t ZSTD_compressBlock_btultra_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 2 /*optLevel*/, 1 /*extDict*/);
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, cParams, src, srcSize, 2 /*optLevel*/, 1 /*extDict*/);
 }
--- a/sys/contrib/zstd/lib/compress/zstd_opt.h
+++ b/sys/contrib/zstd/lib/compress/zstd_opt.h
@ -15,13 +15,25 @@
 extern "C" {
 #endif

-#include "zstd.h"   /* ZSTD_CCtx, size_t */
+#include "zstd_compress_internal.h"

-size_t ZSTD_compressBlock_btopt(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
-size_t ZSTD_compressBlock_btultra(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
+void ZSTD_updateTree(
+        ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,
+        const BYTE* ip, const BYTE* iend);  /* used in ZSTD_loadDictionaryContent() */

-size_t ZSTD_compressBlock_btopt_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
-size_t ZSTD_compressBlock_btultra_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize);
+size_t ZSTD_compressBlock_btopt(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_btopt_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra_extDict(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize);

 #if defined (__cplusplus)
 }
--- a/sys/contrib/zstd/lib/compress/zstdmt_compress.c
+++ b/sys/contrib/zstd/lib/compress/zstdmt_compress.c
--- a/sys/contrib/zstd/lib/compress/zstdmt_compress.h
+++ b/sys/contrib/zstd/lib/compress/zstdmt_compress.h
@ -30,15 +30,15 @@

 /* ===   Memory management   === */
 typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx;
-ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbThreads);
-ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbThreads,
+ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers);
+ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers,
                                                    ZSTD_customMem cMem);
 ZSTDLIB_API size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx);

 ZSTDLIB_API size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx);


-/* ===   Simple buffer-to-butter one-pass function   === */
+/* ===   Simple one-pass compression function   === */

 ZSTDLIB_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
                                       void* dst, size_t dstCapacity,
@ -50,7 +50,7 @@ ZSTDLIB_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
 /* ===   Streaming functions   === */

 ZSTDLIB_API size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel);
-ZSTDLIB_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize);  /**< if srcSize is not known at reset time, use ZSTD_CONTENTSIZE_UNKNOWN. Note: for compatibility with older programs, 0 means the same as ZSTD_CONTENTSIZE_UNKNOWN, but it may change in the future, to mean "empty" */
+ZSTDLIB_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize);  /**< if srcSize is not known at reset time, use ZSTD_CONTENTSIZE_UNKNOWN. Note: for compatibility with older programs, 0 means the same as ZSTD_CONTENTSIZE_UNKNOWN, but it will change in the future to mean "empty" */

 ZSTDLIB_API size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input);

@ -68,7 +68,7 @@ ZSTDLIB_API size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx,
                                           void* dst, size_t dstCapacity,
                                     const void* src, size_t srcSize,
                                     const ZSTD_CDict* cdict,
-                                           ZSTD_parameters const params,
+                                           ZSTD_parameters params,
                                           unsigned overlapLog);

 ZSTDLIB_API size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx,
@ -85,7 +85,7 @@ ZSTDLIB_API size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* mtctx,
 * List of parameters that can be set using ZSTDMT_setMTCtxParameter() */
 typedef enum {
    ZSTDMT_p_jobSize,           /* Each job is compressed in parallel. By default, this value is dynamically determined depending on compression parameters. Can be set explicitly here. */
-    ZSTDMT_p_overlapSectionLog  /* Each job may reload a part of previous job to enhance compressionr ratio; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window */
+    ZSTDMT_p_overlapSectionLog  /* Each job may reload a part of previous job to enhance compressionr ratio; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window. This is a "sticky" parameter : its value will be re-used on next compression job */
 } ZSTDMT_parameter;

 /* ZSTDMT_setMTCtxParameter() :
@ -97,30 +97,46 @@ ZSTDLIB_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter


 /*! ZSTDMT_compressStream_generic() :
- *  Combines ZSTDMT_compressStream() with ZSTDMT_flushStream() or ZSTDMT_endStream()
+ *  Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream()
 *  depending on flush directive.
 * @return : minimum amount of data still to be flushed
 *           0 if fully flushed
- *           or an error code */
+ *           or an error code
+ *  note : needs to be init using any ZSTD_initCStream*() variant */
 ZSTDLIB_API size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
                                                ZSTD_outBuffer* output,
                                                ZSTD_inBuffer* input,
                                                ZSTD_EndDirective endOp);


-/* ===   Private definitions; never ever use directly  === */
+/* ========================================================
+ * ===  Private interface, for use by ZSTD_compress.c   ===
+ * ===  Not exposed in libzstd. Never invoke directly   ===
+ * ======================================================== */

 size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, ZSTDMT_parameter parameter, unsigned value);

-/* ZSTDMT_CCtxParam_setNbThreads()
- * Set nbThreads, and clamp it correctly,
- * also reset jobSize and overlapLog */
-size_t ZSTDMT_CCtxParam_setNbThreads(ZSTD_CCtx_params* params, unsigned nbThreads);
+/* ZSTDMT_CCtxParam_setNbWorkers()
+ * Set nbWorkers, and clamp it.
+ * Also reset jobSize and overlapLog */
+size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers);

-/* ZSTDMT_getNbThreads():
+/*! ZSTDMT_updateCParams_whileCompressing() :
+ *  Updates only a selected set of compression parameters, to remain compatible with current frame.
+ *  New parameters will be applied to next compression job. */
+void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams);
+
+/* ZSTDMT_getNbWorkers():
 * @return nb threads currently active in mtctx.
 * mtctx must be valid */
-size_t ZSTDMT_getNbThreads(const ZSTDMT_CCtx* mtctx);
+unsigned ZSTDMT_getNbWorkers(const ZSTDMT_CCtx* mtctx);
+
+/* ZSTDMT_getFrameProgression():
+ * tells how much data has been consumed (input) and produced (output) for current frame.
+ * able to count progression inside worker threads.
+ */
+ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx);
+

 /*! ZSTDMT_initCStream_internal() :
 *  Private use only. Init streaming operation.
@ -128,7 +144,7 @@ size_t ZSTDMT_getNbThreads(const ZSTDMT_CCtx* mtctx);
 *  must receive dict, or cdict, or none, but not both.
 *  @return : 0, or an error code */
 size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs,
-                    const void* dict, size_t dictSize, ZSTD_dictMode_e dictMode,
+                    const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
                    const ZSTD_CDict* cdict,
                    ZSTD_CCtx_params params, unsigned long long pledgedSrcSize);

--- a/sys/contrib/zstd/lib/decompress/huf_decompress.c
+++ b/sys/contrib/zstd/lib/decompress/huf_decompress.c
@ -49,18 +49,19 @@
 ****************************************************************/
 #define HUF_isError ERR_isError
 #define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+#define CHECK_F(f) { size_t const err_ = (f); if (HUF_isError(err_)) return err_; }


 /* **************************************************************
 *  Byte alignment for workSpace management
 ****************************************************************/
-#define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1)
+#define HUF_ALIGN(x, a)         HUF_ALIGN_MASK((x), (a) - 1)
 #define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))

+
 /*-***************************/
 /*  generic DTableDesc       */
 /*-***************************/
-
 typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc;

 static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
@ -74,7 +75,6 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
 /*-***************************/
 /*  single-symbol decoding   */
 /*-***************************/
-
 typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2;   /* single-symbol decoding */

 size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
@ -94,10 +94,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize
    huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32);
    spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;

-    if ((spaceUsed32 << 2) > wkspSize)
-        return ERROR(tableLog_tooLarge);
-    workSpace = (U32 *)workSpace + spaceUsed32;
-    wkspSize -= (spaceUsed32 << 2);
+    if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);

    HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
    /* memset(huffWeight, 0, sizeof(huffWeight)); */   /* is not necessary, even though some analyzer complain ... */
@ -144,8 +141,10 @@ size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
                                 workSpace, sizeof(workSpace));
 }

+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4;  /* double-symbols decoding */

-static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
+FORCE_INLINE_TEMPLATE BYTE
+HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
 {
    size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
    BYTE const c = dt[val].byte;
@ -156,7 +155,7 @@ static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, con
 #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
    *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)

-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr)  \
    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)

@ -164,30 +163,33 @@ static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, con
    if (MEM_64bits()) \
        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)

-HINT_INLINE size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
+HINT_INLINE size_t
+HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
 {
    BYTE* const pStart = p;

    /* up to 4 symbols at a time */
-    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4)) {
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
        HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
    }

-    /* closer to the end */
-    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd))
-        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+    /* [0-3] symbols remaining */
+    if (MEM_32bits())
+        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd))
+            HUF_DECODE_SYMBOLX2_0(p, bitDPtr);

-    /* no more data to retrieve from bitstream, hence no need to reload */
+    /* no more data to retrieve from bitstream, no need to reload */
    while (p < pEnd)
        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);

    return pEnd-pStart;
 }

-static size_t HUF_decompress1X2_usingDTable_internal(
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X2_usingDTable_internal_body(
          void* dst,  size_t dstSize,
    const void* cSrc, size_t cSrcSize,
    const HUF_DTable* DTable)
@ -200,58 +202,17 @@ static size_t HUF_decompress1X2_usingDTable_internal(
    DTableDesc const dtd = HUF_getDTableDesc(DTable);
    U32 const dtLog = dtd.tableLog;

-    { size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);
-      if (HUF_isError(errorCode)) return errorCode; }
+    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );

    HUF_decodeStreamX2(op, &bitD, oend, dt, dtLog);

-    /* check */
    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);

    return dstSize;
 }

-size_t HUF_decompress1X2_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 0) return ERROR(GENERIC);
-    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
-}
-
-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
-                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
-{
-    const BYTE* ip = (const BYTE*) cSrc;
-
-    size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
-    if (HUF_isError(hSize)) return hSize;
-    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
-    ip += hSize; cSrcSize -= hSize;
-
-    return HUF_decompress1X2_usingDTable_internal (dst, dstSize, ip, cSrcSize, DCtx);
-}
-
-
-size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
-                              const void* cSrc, size_t cSrcSize)
-{
-    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
-    return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
-                                       workSpace, sizeof(workSpace));
-}
-
-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
-{
-    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
-    return HUF_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
-}
-
-
-static size_t HUF_decompress4X2_usingDTable_internal(
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X2_usingDTable_internal_body(
          void* dst,  size_t dstSize,
    const void* cSrc, size_t cSrcSize,
    const HUF_DTable* DTable)
@ -286,23 +247,19 @@ static size_t HUF_decompress4X2_usingDTable_internal(
        BYTE* op2 = opStart2;
        BYTE* op3 = opStart3;
        BYTE* op4 = opStart4;
-        U32 endSignal;
+        U32 endSignal = BIT_DStream_unfinished;
        DTableDesc const dtd = HUF_getDTableDesc(DTable);
        U32 const dtLog = dtd.tableLog;

        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
-        { size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1);
-          if (HUF_isError(errorCode)) return errorCode; }
-        { size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2);
-          if (HUF_isError(errorCode)) return errorCode; }
-        { size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3);
-          if (HUF_isError(errorCode)) return errorCode; }
-        { size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4);
-          if (HUF_isError(errorCode)) return errorCode; }
+        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+        CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+        CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );

-        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
-        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; ) {
+        while ( (endSignal==BIT_DStream_unfinished) && (op4<(oend-3)) ) {
            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
@ -319,10 +276,15 @@ static size_t HUF_decompress4X2_usingDTable_internal(
            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
-            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+            BIT_reloadDStream(&bitD1);
+            BIT_reloadDStream(&bitD2);
+            BIT_reloadDStream(&bitD3);
+            BIT_reloadDStream(&bitD4);
        }

        /* check corruption */
+        /* note : should not be necessary : op# advance in lock step, and we control op4.
+         *        but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */
        if (op1 > opStart2) return ERROR(corruption_detected);
        if (op2 > opStart3) return ERROR(corruption_detected);
        if (op3 > opStart4) return ERROR(corruption_detected);
@ -335,8 +297,8 @@ static size_t HUF_decompress4X2_usingDTable_internal(
        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);

        /* check */
-        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
-        if (!endSignal) return ERROR(corruption_detected);
+        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+          if (!endCheck) return ERROR(corruption_detected); }

        /* decoded size */
        return dstSize;
@ -344,6 +306,279 @@ static size_t HUF_decompress4X2_usingDTable_internal(
 }


+FORCE_INLINE_TEMPLATE U32
+HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 2);
+    BIT_skipBits(DStream, dt[val].nbBits);
+    return dt[val].length;
+}
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 1);
+    if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
+    else {
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
+            BIT_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
+    }   }
+    return 1;
+}
+
+#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+HINT_INLINE size_t
+HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
+                const HUF_DEltX4* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to end : up to 2 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)
+        p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X4_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    BIT_DStream_t bitD;
+
+    /* Init */
+    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+
+    /* decode */
+    {   BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
+        const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog);
+    }
+
+    /* check */
+    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    /* decoded size */
+    return dstSize;
+}
+
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X4_usingDTable_internal_body(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable+1;
+        const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        size_t const segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+        CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+        CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+          if (!endCheck) return ERROR(corruption_detected); }
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
+                                               const void *cSrc,
+                                               size_t cSrcSize,
+                                               const HUF_DTable *DTable);
+#if DYNAMIC_BMI2
+
+#define X(fn)                                                               \
+                                                                            \
+    static size_t fn##_default(                                             \
+                  void* dst,  size_t dstSize,                               \
+            const void* cSrc, size_t cSrcSize,                              \
+            const HUF_DTable* DTable)                                       \
+    {                                                                       \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }                                                                       \
+                                                                            \
+    static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2(                       \
+                  void* dst,  size_t dstSize,                               \
+            const void* cSrc, size_t cSrcSize,                              \
+            const HUF_DTable* DTable)                                       \
+    {                                                                       \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }                                                                       \
+                                                                            \
+    static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
+                     size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
+    {                                                                       \
+        if (bmi2) {                                                         \
+            return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);         \
+        }                                                                   \
+        return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable);          \
+    }
+
+#else
+
+#define X(fn)                                                               \
+    static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
+                     size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
+    {                                                                       \
+        (void)bmi2;                                                         \
+        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
+    }
+
+#endif
+
+X(HUF_decompress1X2_usingDTable_internal)
+X(HUF_decompress4X2_usingDTable_internal)
+X(HUF_decompress1X4_usingDTable_internal)
+X(HUF_decompress4X4_usingDTable_internal)
+
+#undef X
+
+
+size_t HUF_decompress1X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 0) return ERROR(GENERIC);
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+                              const void* cSrc, size_t cSrcSize)
+{
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
 size_t HUF_decompress4X2_usingDTable(
          void* dst,  size_t dstSize,
    const void* cSrc, size_t cSrcSize,
@ -351,13 +586,12 @@ size_t HUF_decompress4X2_usingDTable(
 {
    DTableDesc dtd = HUF_getDTableDesc(DTable);
    if (dtd.tableType != 0) return ERROR(GENERIC);
-    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
 }

-
-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
+                                   void* workSpace, size_t wkspSize, int bmi2)
 {
    const BYTE* ip = (const BYTE*) cSrc;

@ -367,7 +601,14 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
    ip += hSize; cSrcSize -= hSize;

-    return HUF_decompress4X2_usingDTable_internal (dst, dstSize, ip, cSrcSize, dctx);
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{
+    return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
 }


@ -387,8 +628,6 @@ size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cS
 /* *************************/
 /* double-symbols decoding */
 /* *************************/
-typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4;  /* double-symbols decoding */
-
 typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;

 /* HUF_fillDTableX4Level2() :
@ -508,10 +747,7 @@ size_t HUF_readDTableX4_wksp(HUF_DTable* DTable, const void* src,
    weightList = (BYTE *)((U32 *)workSpace + spaceUsed32);
    spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;

-    if ((spaceUsed32 << 2) > wkspSize)
-        return ERROR(tableLog_tooLarge);
-    workSpace = (U32 *)workSpace + spaceUsed32;
-    wkspSize -= (spaceUsed32 << 2);
+    if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);

    rankStart = rankStart0 + 1;
    memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
@ -588,95 +824,6 @@ size_t HUF_readDTableX4(HUF_DTable* DTable, const void* src, size_t srcSize)
                               workSpace, sizeof(workSpace));
 }

-static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
-{
-    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
-    memcpy(op, dt+val, 2);
-    BIT_skipBits(DStream, dt[val].nbBits);
-    return dt[val].length;
-}
-
-static U32 HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
-{
-    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
-    memcpy(op, dt+val, 1);
-    if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
-    else {
-        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
-            BIT_skipBits(DStream, dt[val].nbBits);
-            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
-                /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
-                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
-    }   }
-    return 1;
-}
-
-
-#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
-    ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
-
-#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
-    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
-        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
-
-#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
-    if (MEM_64bits()) \
-        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
-
-HINT_INLINE size_t HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const HUF_DEltX4* const dt, const U32 dtLog)
-{
-    BYTE* const pStart = p;
-
-    /* up to 8 symbols at a time */
-    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
-        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
-        HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
-        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
-        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
-    }
-
-    /* closer to end : up to 2 symbols at a time */
-    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
-        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
-
-    while (p <= pEnd-2)
-        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
-
-    if (p < pEnd)
-        p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
-
-    return p-pStart;
-}
-
-
-static size_t HUF_decompress1X4_usingDTable_internal(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    BIT_DStream_t bitD;
-
-    /* Init */
-    {   size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);
-        if (HUF_isError(errorCode)) return errorCode;
-    }
-
-    /* decode */
-    {   BYTE* const ostart = (BYTE*) dst;
-        BYTE* const oend = ostart + dstSize;
-        const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
-        const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
-        DTableDesc const dtd = HUF_getDTableDesc(DTable);
-        HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog);
-    }
-
-    /* check */
-    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
-
-    /* decoded size */
-    return dstSize;
-}
-
 size_t HUF_decompress1X4_usingDTable(
          void* dst,  size_t dstSize,
    const void* cSrc, size_t cSrcSize,
@ -684,7 +831,7 @@ size_t HUF_decompress1X4_usingDTable(
 {
    DTableDesc dtd = HUF_getDTableDesc(DTable);
    if (dtd.tableType != 1) return ERROR(GENERIC);
-    return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+    return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
 }

 size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
@ -699,7 +846,7 @@ size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
    ip += hSize; cSrcSize -= hSize;

-    return HUF_decompress1X4_usingDTable_internal (dst, dstSize, ip, cSrcSize, DCtx);
+    return HUF_decompress1X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
 }


@ -717,99 +864,6 @@ size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cS
    return HUF_decompress1X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
 }

-static size_t HUF_decompress4X4_usingDTable_internal(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
-
-    {   const BYTE* const istart = (const BYTE*) cSrc;
-        BYTE* const ostart = (BYTE*) dst;
-        BYTE* const oend = ostart + dstSize;
-        const void* const dtPtr = DTable+1;
-        const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
-
-        /* Init */
-        BIT_DStream_t bitD1;
-        BIT_DStream_t bitD2;
-        BIT_DStream_t bitD3;
-        BIT_DStream_t bitD4;
-        size_t const length1 = MEM_readLE16(istart);
-        size_t const length2 = MEM_readLE16(istart+2);
-        size_t const length3 = MEM_readLE16(istart+4);
-        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
-        const BYTE* const istart1 = istart + 6;  /* jumpTable */
-        const BYTE* const istart2 = istart1 + length1;
-        const BYTE* const istart3 = istart2 + length2;
-        const BYTE* const istart4 = istart3 + length3;
-        size_t const segmentSize = (dstSize+3) / 4;
-        BYTE* const opStart2 = ostart + segmentSize;
-        BYTE* const opStart3 = opStart2 + segmentSize;
-        BYTE* const opStart4 = opStart3 + segmentSize;
-        BYTE* op1 = ostart;
-        BYTE* op2 = opStart2;
-        BYTE* op3 = opStart3;
-        BYTE* op4 = opStart4;
-        U32 endSignal;
-        DTableDesc const dtd = HUF_getDTableDesc(DTable);
-        U32 const dtLog = dtd.tableLog;
-
-        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
-        { size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1);
-          if (HUF_isError(errorCode)) return errorCode; }
-        { size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2);
-          if (HUF_isError(errorCode)) return errorCode; }
-        { size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3);
-          if (HUF_isError(errorCode)) return errorCode; }
-        { size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4);
-          if (HUF_isError(errorCode)) return errorCode; }
-
-        /* 16-32 symbols per loop (4-8 symbols per stream) */
-        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
-        for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
-            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
-            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
-            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
-            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
-            HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
-            HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
-            HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
-            HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
-            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
-            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
-            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
-            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
-            HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
-            HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
-            HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
-            HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
-
-            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
-        }
-
-        /* check corruption */
-        if (op1 > opStart2) return ERROR(corruption_detected);
-        if (op2 > opStart3) return ERROR(corruption_detected);
-        if (op3 > opStart4) return ERROR(corruption_detected);
-        /* note : op4 already verified within main loop */
-
-        /* finish bitStreams one by one */
-        HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
-        HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
-        HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
-        HUF_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
-
-        /* check */
-        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
-          if (!endCheck) return ERROR(corruption_detected); }
-
-        /* decoded size */
-        return dstSize;
-    }
-}
-
-
 size_t HUF_decompress4X4_usingDTable(
          void* dst,  size_t dstSize,
    const void* cSrc, size_t cSrcSize,
@ -817,13 +871,12 @@ size_t HUF_decompress4X4_usingDTable(
 {
    DTableDesc dtd = HUF_getDTableDesc(DTable);
    if (dtd.tableType != 1) return ERROR(GENERIC);
-    return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+    return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
 }

-
-size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X4_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
+                                   void* workSpace, size_t wkspSize, int bmi2)
 {
    const BYTE* ip = (const BYTE*) cSrc;

@ -833,7 +886,14 @@ size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
    ip += hSize; cSrcSize -= hSize;

-    return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx);
+    return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+                                   const void* cSrc, size_t cSrcSize,
+                                   void* workSpace, size_t wkspSize)
+{
+    return HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
 }


@ -861,8 +921,8 @@ size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
                                    const HUF_DTable* DTable)
 {
    DTableDesc const dtd = HUF_getDTableDesc(DTable);
-    return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) :
-                           HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
+    return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
+                           HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
 }

 size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
@ -870,8 +930,8 @@ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
                                    const HUF_DTable* DTable)
 {
    DTableDesc const dtd = HUF_getDTableDesc(DTable);
-    return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) :
-                           HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
+    return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
+                           HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
 }


@ -898,21 +958,22 @@ static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, qu
 };

 /** HUF_selectDecoder() :
-*   Tells which decoder is likely to decode faster,
-*   based on a set of pre-determined metrics.
-*   @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
-*   Assumption : 0 < cSrcSize, dstSize <= 128 KB */
+ *  Tells which decoder is likely to decode faster,
+ *  based on a set of pre-computed metrics.
+ * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+ *  Assumption : 0 < dstSize <= 128 KB */
 U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
 {
+    assert(dstSize > 0);
+    assert(dstSize <= 128 KB);
    /* decoder timing evaluation */
-    U32 const Q = cSrcSize >= dstSize ? 15 : (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 */
-    U32 const D256 = (U32)(dstSize >> 8);
-    U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
-    U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
-    DTime1 += DTime1 >> 3;  /* advantage to algorithm using less memory, for cache eviction */
-
-    return DTime1 < DTime0;
-}
+    {   U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 */
+        U32 const D256 = (U32)(dstSize >> 8);
+        U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
+        U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
+        DTime1 += DTime1 >> 3;  /* advantage to algorithm using less memory, to reduce cache eviction */
+        return DTime1 < DTime0;
+}   }


 typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
@ -994,3 +1055,42 @@ size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
    return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
                                      workSpace, sizeof(workSpace));
 }
+
+
+size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+{
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
+                           HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+}
+
+size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+{
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
+                           HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+}
+
+size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize == 0) return ERROR(corruption_detected);
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUF_decompress4X4_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
+                        HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+    }
+}
--- a/sys/contrib/zstd/lib/decompress/zstd_decompress.c
+++ b/sys/contrib/zstd/lib/decompress/zstd_decompress.c
--- a/sys/contrib/zstd/lib/dictBuilder/cover.c
+++ b/sys/contrib/zstd/lib/dictBuilder/cover.c
@ -537,8 +537,8 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
  /* Checks */
  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
      totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
-    DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n",
-                 (COVER_MAX_SAMPLES_SIZE >> 20));
+    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                 (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
    return 0;
  }
  /* Zero the context */
@ -651,12 +651,16 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
 }

 ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
-    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
-    const size_t *samplesSizes, unsigned nbSamples,
-    ZDICT_cover_params_t parameters) {
-  BYTE *const dict = (BYTE *)dictBuffer;
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t parameters)
+{
+  BYTE* const dict = (BYTE*)dictBuffer;
  COVER_ctx_t ctx;
  COVER_map_t activeDmers;
+
+  /* Initialize global data */
+  g_displayLevel = parameters.zParams.notificationLevel;
  /* Checks */
  if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
    DISPLAYLEVEL(1, "Cover parameters incorrect\n");
@ -671,8 +675,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
                 ZDICT_DICTSIZE_MIN);
    return ERROR(dstSize_tooSmall);
  }
-  /* Initialize global data */
-  g_displayLevel = parameters.zParams.notificationLevel;
  /* Initialize context and activeDmers */
  if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
                      parameters.d)) {
@ -947,6 +949,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
  unsigned k;
  COVER_best_t best;
  POOL_ctx *pool = NULL;
+
  /* Checks */
  if (kMinK < kMaxD || kMaxK < kMinK) {
    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
--- a/sys/contrib/zstd/lib/dictBuilder/zdict.c
+++ b/sys/contrib/zstd/lib/dictBuilder/zdict.c
@ -207,7 +207,6 @@ static dictItem ZDICT_analyzePos(
    U32 cumulLength[LLIMIT] = {0};
    U32 savings[LLIMIT] = {0};
    const BYTE* b = (const BYTE*)buffer;
-    size_t length;
    size_t maxLength = LLIMIT;
    size_t pos = suffix[start];
    U32 end = start;
@ -222,26 +221,30 @@ static dictItem ZDICT_analyzePos(
       ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
       ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
        /* skip and mark segment */
-        U16 u16 = MEM_read16(b+pos+4);
-        U32 u, e = 6;
-        while (MEM_read16(b+pos+e) == u16) e+=2 ;
-        if (b[pos+e] == b[pos+e-1]) e++;
-        for (u=1; u<e; u++)
+        U16 const pattern16 = MEM_read16(b+pos+4);
+        U32 u, patternEnd = 6;
+        while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
+        if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
+        for (u=1; u<patternEnd; u++)
            doneMarks[pos+u] = 1;
        return solution;
    }

    /* look forward */
-    do {
-        end++;
-        length = ZDICT_count(b + pos, b + suffix[end]);
-    } while (length >=MINMATCHLENGTH);
+    {   size_t length;
+        do {
+            end++;
+            length = ZDICT_count(b + pos, b + suffix[end]);
+        } while (length >= MINMATCHLENGTH);
+    }

    /* look backward */
-    do {
-        length = ZDICT_count(b + pos, b + *(suffix+start-1));
-        if (length >=MINMATCHLENGTH) start--;
-    } while(length >= MINMATCHLENGTH);
+    {   size_t length;
+        do {
+            length = ZDICT_count(b + pos, b + *(suffix+start-1));
+            if (length >=MINMATCHLENGTH) start--;
+        } while(length >= MINMATCHLENGTH);
+    }

    /* exit if not found a minimum nb of repetitions */
    if (end-start < minRatio) {
@ -268,7 +271,7 @@ static dictItem ZDICT_analyzePos(
            U32 selectedCount = 0;
            U32 selectedID = currentID;
            for (id =refinedStart; id < refinedEnd; id++) {
-                if (b[ suffix[id] + searchLength] != currentChar) {
+                if (b[suffix[id] + searchLength] != currentChar) {
                    if (currentCount > selectedCount) {
                        selectedCount = currentCount;
                        selectedID = currentID;
@ -297,20 +300,23 @@ static dictItem ZDICT_analyzePos(
        memset(lengthList, 0, sizeof(lengthList));

        /* look forward */
-        do {
-            end++;
-            length = ZDICT_count(b + pos, b + suffix[end]);
-            if (length >= LLIMIT) length = LLIMIT-1;
-            lengthList[length]++;
-        } while (length >=MINMATCHLENGTH);
+        {   size_t length;
+            do {
+                end++;
+                length = ZDICT_count(b + pos, b + suffix[end]);
+                if (length >= LLIMIT) length = LLIMIT-1;
+                lengthList[length]++;
+            } while (length >=MINMATCHLENGTH);
+        }

        /* look backward */
-        length = MINMATCHLENGTH;
-        while ((length >= MINMATCHLENGTH) & (start > 0)) {
-            length = ZDICT_count(b + pos, b + suffix[start - 1]);
-            if (length >= LLIMIT) length = LLIMIT - 1;
-            lengthList[length]++;
-            if (length >= MINMATCHLENGTH) start--;
+        {   size_t length = MINMATCHLENGTH;
+            while ((length >= MINMATCHLENGTH) & (start > 0)) {
+                length = ZDICT_count(b + pos, b + suffix[start - 1]);
+                if (length >= LLIMIT) length = LLIMIT - 1;
+                lengthList[length]++;
+                if (length >= MINMATCHLENGTH) start--;
+            }
        }

        /* largest useful length */
@ -345,12 +351,12 @@ static dictItem ZDICT_analyzePos(
        /* mark positions done */
        {   U32 id;
            for (id=start; id<end; id++) {
-                U32 p, pEnd;
+                U32 p, pEnd, length;
                U32 const testedPos = suffix[id];
                if (testedPos == pos)
                    length = solution.length;
                else {
-                    length = ZDICT_count(b+pos, b+testedPos);
+                    length = (U32)ZDICT_count(b+pos, b+testedPos);
                    if (length > solution.length) length = solution.length;
                }
                pEnd = (U32)(testedPos + length);
@ -575,29 +581,30 @@ static void ZDICT_fillNoise(void* buffer, size_t length)

 typedef struct
 {
-    ZSTD_CCtx* ref;
-    ZSTD_CCtx* zc;
+    ZSTD_CCtx* ref;    /* contains reference to dictionary */
+    ZSTD_CCtx* zc;     /* working context */
    void* workPlace;   /* must be ZSTD_BLOCKSIZE_MAX allocated */
 } EStats_ress_t;

 #define MAXREPOFFSET 1024

 static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
-                            U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
-                            const void* src, size_t srcSize, U32 notificationLevel)
+                              U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
+                              const void* src, size_t srcSize,
+                              U32 notificationLevel)
 {
    size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
    size_t cSize;

    if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */
-    {  size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
-            if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
+    {   size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
+        if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
    }
    cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
    if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }

    if (cSize) {  /* if == 0; block is not compressible */
-        const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
+        const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);

        /* literals stats */
        {   const BYTE* bytePtr;
@ -659,6 +666,18 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
    }
 }

+/* ZDICT_flatLit() :
+ * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
+ * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
+ */
+static void ZDICT_flatLit(U32* countLit)
+{
+    int u;
+    for (u=1; u<256; u++) countLit[u] = 2;
+    countLit[0]   = 4;
+    countLit[253] = 1;
+    countLit[254] = 1;
+}

 #define OFFCODE_MAX 30  /* only applicable to first block */
 static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
@ -688,6 +707,7 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
    BYTE* dstPtr = (BYTE*)dstBuffer;

    /* init */
+    DEBUGLOG(4, "ZDICT_analyzeEntropy");
    esr.ref = ZSTD_createCCtx();
    esr.zc = ZSTD_createCCtx();
    esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
@ -713,7 +733,7 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
            goto _cleanup;
    }   }

-    /* collect stats on all files */
+    /* collect stats on all samples */
    for (u=0; u<nbFiles; u++) {
        ZDICT_countEStats(esr, params,
                          countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
@ -722,14 +742,21 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
        pos += fileSizes[u];
    }

-    /* analyze */
-    errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
-    if (HUF_isError(errorCode)) {
-        eSize = ERROR(GENERIC);
-        DISPLAYLEVEL(1, "HUF_buildCTable error \n");
-        goto _cleanup;
+    /* analyze, build stats, starting with literals */
+    {   size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+        if (HUF_isError(maxNbBits)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, " HUF_buildCTable error \n");
+            goto _cleanup;
+        }
+        if (maxNbBits==8) {  /* not compressible : will fail on HUF_writeCTable() */
+            DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
+            ZDICT_flatLit(countLit);  /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
+            maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+            assert(maxNbBits==9);
+        }
+        huffLog = (U32)maxNbBits;
    }
-    huffLog = (U32)errorCode;

    /* looking for most common first offsets */
    {   U32 offset;
@ -850,6 +877,7 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
    U32 const notificationLevel = params.notificationLevel;

    /* check conditions */
+    DEBUGLOG(4, "ZDICT_finalizeDictionary");
    if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
    if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
@ -1025,8 +1053,9 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy(
 }


-/* issue : samplesBuffer need to be followed by a noisy guard band.
-*  work around : duplicate the buffer, and add the noise */
+/* ZDICT_trainFromBuffer_legacy() :
+ * issue : samplesBuffer need to be followed by a noisy guard band.
+ * work around : duplicate the buffer, and add the noise */
 size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
                              ZDICT_legacy_params_t params)
@ -1054,18 +1083,22 @@ size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
 {
    ZDICT_cover_params_t params;
+    DEBUGLOG(3, "ZDICT_trainFromBuffer");
    memset(&params, 0, sizeof(params));
    params.d = 8;
    params.steps = 4;
-    /* Default to level 6 since no compression level information is avaialble */
+    /* Default to level 6 since no compression level information is available */
    params.zParams.compressionLevel = 6;
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
+    params.zParams.notificationLevel = ZSTD_DEBUG;
+#endif
    return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
-                                               samplesBuffer, samplesSizes,
-                                               nbSamples, &params);
+                                               samplesBuffer, samplesSizes, nbSamples,
+                                               &params);
 }

 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
-                                        const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
+                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
 {
    ZDICT_params_t params;
    memset(&params, 0, sizeof(params));
--- a/sys/contrib/zstd/lib/dictBuilder/zdict.h
+++ b/sys/contrib/zstd/lib/dictBuilder/zdict.h
@ -38,21 +38,21 @@ extern "C" {


 /*! ZDICT_trainFromBuffer():
- * Train a dictionary from an array of samples.
- * Uses ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4.
- * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
- * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
- * The resulting dictionary will be saved into `dictBuffer`.
+ *  Train a dictionary from an array of samples.
+ *  Redirect towards ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
 * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
- *           or an error code, which can be tested with ZDICT_isError().
- * Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte.
- * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
- *        It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
- *        In general, it's recommended to provide a few thousands samples, but this can vary a lot.
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte.
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
 *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
 */
 ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
-                       const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
+                                    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);


 /*======   Helper functions   ======*/
@ -72,14 +72,14 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
 * ==================================================================================== */

 typedef struct {
-    int      compressionLevel;   /* 0 means default; target a specific zstd compression level */
-    unsigned notificationLevel;  /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
-    unsigned dictID;             /* 0 means auto mode (32-bits random value); other : force dictID value */
+    int      compressionLevel;   /* optimize for a specific zstd compression level; 0 means default */
+    unsigned notificationLevel;  /* Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
+    unsigned dictID;             /* force dictID value; 0 means auto mode (32-bits random value) */
 } ZDICT_params_t;

 /*! ZDICT_cover_params_t:
- *  For all values 0 means default.
 *  k and d are the only required parameters.
+ *  For others, value 0 means default.
 */
 typedef struct {
    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
@ -91,28 +91,28 @@ typedef struct {


 /*! ZDICT_trainFromBuffer_cover():
- * Train a dictionary from an array of samples using the COVER algorithm.
- * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
- * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
- * The resulting dictionary will be saved into `dictBuffer`.
+ *  Train a dictionary from an array of samples using the COVER algorithm.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
 * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
- *           or an error code, which can be tested with ZDICT_isError().
- * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
- * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
- *        It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
- *        In general, it's recommended to provide a few thousands samples, but this can vary a lot.
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
 *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
 */
 ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
-    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
-    const size_t *samplesSizes, unsigned nbSamples,
-    ZDICT_cover_params_t parameters);
+          void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+          ZDICT_cover_params_t parameters);

 /*! ZDICT_optimizeTrainFromBuffer_cover():
 * The same requirements as above hold for all the parameters except `parameters`.
 * This function tries many parameter combinations and picks the best parameters.
- * `*parameters` is filled with the best parameters found, and the dictionary
- * constructed with those parameters is stored in `dictBuffer`.
+ * `*parameters` is filled with the best parameters found,
+ * dictionary constructed with those parameters is stored in `dictBuffer`.
 *
 * All of the parameters d, k, steps are optional.
 * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
@ -125,9 +125,9 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
 * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread.
 */
 ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
-    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
-    const size_t *samplesSizes, unsigned nbSamples,
-    ZDICT_cover_params_t *parameters);
+          void* dictBuffer, size_t dictBufferCapacity,
+    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+          ZDICT_cover_params_t* parameters);

 /*! ZDICT_finalizeDictionary():
 * Given a custom content as a basis for dictionary, and a set of samples,
@ -157,22 +157,23 @@ typedef struct {
 } ZDICT_legacy_params_t;

 /*! ZDICT_trainFromBuffer_legacy():
- * Train a dictionary from an array of samples.
- * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
- * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
- * The resulting dictionary will be saved into `dictBuffer`.
+ *  Train a dictionary from an array of samples.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
 * `parameters` is optional and can be provided with values set to 0 to mean "default".
 * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
- *           or an error code, which can be tested with ZDICT_isError().
- * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
- *        It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
- *        In general, it's recommended to provide a few thousands samples, but this can vary a lot.
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
 *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
- * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
+ *  Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
 */
 ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
-    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
-    const size_t *samplesSizes, unsigned nbSamples, ZDICT_legacy_params_t parameters);
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_legacy_params_t parameters);

 /* Deprecation warnings */
 /* It is generally possible to disable deprecation warnings from compiler,
--- a/sys/contrib/zstd/lib/legacy/zstd_legacy.h
+++ b/sys/contrib/zstd/lib/legacy/zstd_legacy.h
@ -246,6 +246,7 @@ MEM_STATIC size_t ZSTD_freeLegacyStreamContext(void* legacyContext, U32 version)
 MEM_STATIC size_t ZSTD_initLegacyStream(void** legacyContext, U32 prevVersion, U32 newVersion,
                                        const void* dict, size_t dictSize)
 {
+    DEBUGLOG(5, "ZSTD_initLegacyStream for v0.%u", newVersion);
    if (prevVersion != newVersion) ZSTD_freeLegacyStreamContext(*legacyContext, prevVersion);
    switch(newVersion)
    {
@ -304,6 +305,7 @@ MEM_STATIC size_t ZSTD_initLegacyStream(void** legacyContext, U32 prevVersion, U
 MEM_STATIC size_t ZSTD_decompressLegacyStream(void* legacyContext, U32 version,
                                              ZSTD_outBuffer* output, ZSTD_inBuffer* input)
 {
+    DEBUGLOG(5, "ZSTD_decompressLegacyStream for v0.%u", version);
    switch(version)
    {
        default :
--- a/sys/contrib/zstd/lib/legacy/zstd_v04.c
+++ b/sys/contrib/zstd/lib/legacy/zstd_v04.c
@ -16,37 +16,6 @@

 /* ******************************************************************
   mem.h
-   low-level memory access routines
-   Copyright (C) 2013-2015, Yann Collet.
-
-   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-
-       * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above
-   copyright notice, this list of conditions and the following disclaimer
-   in the documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-    You can contact the author at :
-    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
-    - Public forum : https://groups.google.com/forum/#!forum/lz4c
 ****************************************************************** */
 #ifndef MEM_H_MODULE
 #define MEM_H_MODULE
@ -103,6 +72,44 @@ extern "C" {
 #endif


+/*-*************************************
+*  Debug
+***************************************/
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
+#  include <assert.h>
+#else
+#  ifndef assert
+#    define assert(condition) ((void)0)
+#  endif
+#endif
+
+#define ZSTD_STATIC_ASSERT(c) { enum { ZSTD_static_assert = 1/(int)(!!(c)) }; }
+
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=2)
+#  include <stdio.h>
+extern int g_debuglog_enable;
+/* recommended values for ZSTD_DEBUG display levels :
+ * 1 : no display, enables assert() only
+ * 2 : reserved for currently active debug path
+ * 3 : events once per object lifetime (CCtx, CDict, etc.)
+ * 4 : events once per frame
+ * 5 : events once per block
+ * 6 : events once per sequence (*very* verbose) */
+#  define RAWLOG(l, ...) {                                      \
+                if ((g_debuglog_enable) & (l<=ZSTD_DEBUG)) {    \
+                    fprintf(stderr, __VA_ARGS__);               \
+            }   }
+#  define DEBUGLOG(l, ...) {                                    \
+                if ((g_debuglog_enable) & (l<=ZSTD_DEBUG)) {    \
+                    fprintf(stderr, __FILE__ ": " __VA_ARGS__); \
+                    fprintf(stderr, " \n");                     \
+            }   }
+#else
+#  define RAWLOG(l, ...)      {}    /* disabled */
+#  define DEBUGLOG(l, ...)    {}    /* disabled */
+#endif
+
+
 /****************************************************************
 *  Memory I/O
 *****************************************************************/
@ -255,34 +262,6 @@ MEM_STATIC size_t MEM_readLEST(const void* memPtr)
 /*
    zstd - standard compression library
    Header File for static linking only
-    Copyright (C) 2014-2015, Yann Collet.
-
-    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-    Redistribution and use in source and binary forms, with or without
-    modification, are permitted provided that the following conditions are
-    met:
-    * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-    copyright notice, this list of conditions and the following disclaimer
-    in the documentation and/or other materials provided with the
-    distribution.
-    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-    You can contact the author at :
-    - zstd source repository : https://github.com/Cyan4973/zstd
-    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
 */
 #ifndef ZSTD_STATIC_H
 #define ZSTD_STATIC_H
@ -392,34 +371,6 @@ static size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t maxDstS
 /*
    zstd_internal - common functions to include
    Header File for include
-    Copyright (C) 2014-2015, Yann Collet.
-
-    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-    Redistribution and use in source and binary forms, with or without
-    modification, are permitted provided that the following conditions are
-    met:
-    * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-    copyright notice, this list of conditions and the following disclaimer
-    in the documentation and/or other materials provided with the
-    distribution.
-    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-    You can contact the author at :
-    - zstd source repository : https://github.com/Cyan4973/zstd
-    - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
 */
 #ifndef ZSTD_CCOMMON_H_MODULE
 #define ZSTD_CCOMMON_H_MODULE
@ -507,36 +458,6 @@ static void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length)
 /* ******************************************************************
   FSE : Finite State Entropy coder
   header file
-   Copyright (C) 2013-2015, Yann Collet.
-
-   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-
-       * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above
-   copyright notice, this list of conditions and the following disclaimer
-   in the documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-   You can contact the author at :
-   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
-   - Public forum : https://groups.google.com/forum/#!forum/lz4c
 ****************************************************************** */
 #ifndef FSE_H
 #define FSE_H
@ -3528,12 +3449,14 @@ static size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbc, void* dst, size_t* maxDs
    char* const oend = ostart + *maxDstSizePtr;
    U32 notDone = 1;

+    DEBUGLOG(5, "ZBUFF_decompressContinue");
    while (notDone)
    {
        switch(zbc->stage)
        {

        case ZBUFFds_init :
+            DEBUGLOG(5, "ZBUFF_decompressContinue: stage==ZBUFFds_init => ERROR(init_missing)");
            return ERROR(init_missing);

        case ZBUFFds_readHeader :
@ -3733,7 +3656,7 @@ size_t ZSTDv04_decompressContinue(ZSTDv04_Dctx* dctx, void* dst, size_t maxDstSi


 ZBUFFv04_DCtx* ZBUFFv04_createDCtx(void) { return ZBUFF_createDCtx(); }
-size_t      ZBUFFv04_freeDCtx(ZBUFFv04_DCtx* dctx) { return ZBUFF_freeDCtx(dctx); }
+size_t ZBUFFv04_freeDCtx(ZBUFFv04_DCtx* dctx) { return ZBUFF_freeDCtx(dctx); }

 size_t ZBUFFv04_decompressInit(ZBUFFv04_DCtx* dctx) { return ZBUFF_decompressInit(dctx); }
 size_t ZBUFFv04_decompressWithDictionary(ZBUFFv04_DCtx* dctx, const void* src, size_t srcSize)
@ -3741,6 +3664,7 @@ size_t ZBUFFv04_decompressWithDictionary(ZBUFFv04_DCtx* dctx, const void* src, s

 size_t ZBUFFv04_decompressContinue(ZBUFFv04_DCtx* dctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr)
 {
+    DEBUGLOG(5, "ZBUFFv04_decompressContinue");
    return ZBUFF_decompressContinue(dctx, dst, maxDstSizePtr, src, srcSizePtr);
 }

--- a/sys/contrib/zstd/lib/legacy/zstd_v06.c
+++ b/sys/contrib/zstd/lib/legacy/zstd_v06.c
@ -189,7 +189,7 @@ MEM_STATIC U32 MEM_swap32(U32 in)
 {
 #if defined(_MSC_VER)     /* Visual Studio */
    return _byteswap_ulong(in);
-#elif defined (__GNUC__)
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
    return __builtin_bswap32(in);
 #else
    return  ((in << 24) & 0xff000000 ) |
@ -203,7 +203,7 @@ MEM_STATIC U64 MEM_swap64(U64 in)
 {
 #if defined(_MSC_VER)     /* Visual Studio */
    return _byteswap_uint64(in);
-#elif defined (__GNUC__)
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
    return __builtin_bswap64(in);
 #else
    return  ((in << 56) & 0xff00000000000000ULL) |
--- a/sys/contrib/zstd/lib/legacy/zstd_v07.c
+++ b/sys/contrib/zstd/lib/legacy/zstd_v07.c
@ -348,7 +348,7 @@ MEM_STATIC U32 MEM_swap32(U32 in)
 {
 #if defined(_MSC_VER)     /* Visual Studio */
    return _byteswap_ulong(in);
-#elif defined (__GNUC__)
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
    return __builtin_bswap32(in);
 #else
    return  ((in << 24) & 0xff000000 ) |
@ -362,7 +362,7 @@ MEM_STATIC U64 MEM_swap64(U64 in)
 {
 #if defined(_MSC_VER)     /* Visual Studio */
    return _byteswap_uint64(in);
-#elif defined (__GNUC__)
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
    return __builtin_bswap64(in);
 #else
    return  ((in << 56) & 0xff00000000000000ULL) |
--- a/sys/contrib/zstd/lib/zstd.h
+++ b/sys/contrib/zstd/lib/zstd.h
@ -45,11 +45,11 @@ extern "C" {
  Levels >= 20, labeled `--ultra`, should be used with caution, as they require more memory.
  Compression can be done in:
    - a single step (described as Simple API)
-    - a single step, reusing a context (described as Explicit memory management)
+    - a single step, reusing a context (described as Explicit context)
    - unbounded multiple steps (described as Streaming compression)
  The compression ratio achievable on small data can be highly improved using a dictionary in:
    - a single step (described as Simple dictionary API)
-    - a single step, reusing a dictionary (described as Fast dictionary API)
+    - a single step, reusing a dictionary (described as Bulk-processing dictionary API)

  Advanced experimental functions can be accessed using #define ZSTD_STATIC_LINKING_ONLY before including zstd.h.
  Advanced experimental APIs shall never be used with a dynamic library.
@ -59,7 +59,7 @@ extern "C" {
 /*------   Version   ------*/
 #define ZSTD_VERSION_MAJOR    1
 #define ZSTD_VERSION_MINOR    3
-#define ZSTD_VERSION_RELEASE  3
+#define ZSTD_VERSION_RELEASE  4

 #define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
 ZSTDLIB_API unsigned ZSTD_versionNumber(void);   /**< useful to check dll version */
@ -68,7 +68,7 @@ ZSTDLIB_API unsigned ZSTD_versionNumber(void);   /**< useful to check dll versio
 #define ZSTD_QUOTE(str) #str
 #define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str)
 #define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION)
-ZSTDLIB_API const char* ZSTD_versionString(void);   /* v1.3.0 */
+ZSTDLIB_API const char* ZSTD_versionString(void);   /* added in v1.3.0 */


 /***************************************
@ -92,7 +92,7 @@ ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
 ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
                              const void* src, size_t compressedSize);

-/*! ZSTD_getFrameContentSize() : v1.3.0
+/*! ZSTD_getFrameContentSize() : added in v1.3.0
 *  `src` should point to the start of a ZSTD encoded frame.
 *  `srcSize` must be at least as large as the frame header.
 *            hint : any size >= `ZSTD_frameHeaderSize_max` is large enough.
@ -120,26 +120,24 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t

 /*! ZSTD_getDecompressedSize() :
 *  NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize().
- *  Both functions work the same way,
- *  but ZSTD_getDecompressedSize() blends
- *  "empty", "unknown" and "error" results in the same return value (0),
- *  while ZSTD_getFrameContentSize() distinguishes them.
- *
- *  'src' is the start of a zstd compressed frame.
- *  @return : content size to be decompressed, as a 64-bits value _if known and not empty_, 0 otherwise. */
+ *  Both functions work the same way, but ZSTD_getDecompressedSize() blends
+ *  "empty", "unknown" and "error" results to the same return value (0),
+ *  while ZSTD_getFrameContentSize() gives them separate return values.
+ * `src` is the start of a zstd compressed frame.
+ * @return : content size to be decompressed, as a 64-bits value _if known and not empty_, 0 otherwise. */
 ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);


 /*======  Helper functions  ======*/
 #define ZSTD_COMPRESSBOUND(srcSize)   ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
-ZSTDLIB_API size_t      ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case scenario */
+ZSTDLIB_API size_t      ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
 ZSTDLIB_API unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
 ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string from an error code */
 ZSTDLIB_API int         ZSTD_maxCLevel(void);               /*!< maximum compression level available */


 /***************************************
-*  Explicit memory management
+*  Explicit context
 ***************************************/
 /*= Compression context
 *  When compressing many times,
@ -345,7 +343,7 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /**< recommended size for output
 * *******************************************************************************/

 typedef ZSTD_DCtx ZSTD_DStream;  /**< DCtx and DStream are now effectively same object (>= v1.3.0) */
-                                 /* Continue to distinguish them for compatibility with versions <= v1.2.0 */
+                                 /* For compatibility with versions <= v1.2.0, continue to consider them separated. */
 /*===== ZSTD_DStream management functions =====*/
 ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
 ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);
@ -375,23 +373,24 @@ ZSTDLIB_API size_t ZSTD_DStreamOutSize(void);   /*!< recommended size for output
 /* --- Constants ---*/
 #define ZSTD_MAGICNUMBER            0xFD2FB528   /* >= v0.8.0 */
 #define ZSTD_MAGIC_SKIPPABLE_START  0x184D2A50U
-#define ZSTD_MAGIC_DICTIONARY       0xEC30A437   /* v0.7+ */
+#define ZSTD_MAGIC_DICTIONARY       0xEC30A437   /* >= v0.7.0 */

 #define ZSTD_WINDOWLOG_MAX_32   30
 #define ZSTD_WINDOWLOG_MAX_64   31
 #define ZSTD_WINDOWLOG_MAX    ((unsigned)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
 #define ZSTD_WINDOWLOG_MIN      10
-#define ZSTD_HASHLOG_MAX        MIN(ZSTD_WINDOWLOG_MAX, 30)
+#define ZSTD_HASHLOG_MAX      ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30)
 #define ZSTD_HASHLOG_MIN         6
-#define ZSTD_CHAINLOG_MAX       MIN(ZSTD_WINDOWLOG_MAX+1, 30)
+#define ZSTD_CHAINLOG_MAX_32    29
+#define ZSTD_CHAINLOG_MAX_64    30
+#define ZSTD_CHAINLOG_MAX     ((unsigned)(sizeof(size_t) == 4 ? ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
 #define ZSTD_CHAINLOG_MIN       ZSTD_HASHLOG_MIN
 #define ZSTD_HASHLOG3_MAX       17
 #define ZSTD_SEARCHLOG_MAX     (ZSTD_WINDOWLOG_MAX-1)
 #define ZSTD_SEARCHLOG_MIN       1
 #define ZSTD_SEARCHLENGTH_MAX    7   /* only for ZSTD_fast, other strategies are limited to 6 */
 #define ZSTD_SEARCHLENGTH_MIN    3   /* only for ZSTD_btopt, other strategies are limited to 4 */
-#define ZSTD_TARGETLENGTH_MIN    4   /* only useful for btopt */
-#define ZSTD_TARGETLENGTH_MAX  999   /* only useful for btopt */
+#define ZSTD_TARGETLENGTH_MIN    1   /* only used by btopt, btultra and btfast */
 #define ZSTD_LDM_MINMATCH_MIN    4
 #define ZSTD_LDM_MINMATCH_MAX 4096
 #define ZSTD_LDM_BUCKETSIZELOG_MAX 8
@ -432,12 +431,17 @@ typedef struct {

 typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;

-/*--- Custom memory allocation functions ---*/
-typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
-typedef void  (*ZSTD_freeFunction) (void* opaque, void* address);
-typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
-/* use this constant to defer to stdlib's functions */
-static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL };
+typedef enum {
+    ZSTD_dct_auto=0,      /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
+    ZSTD_dct_rawContent,  /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
+    ZSTD_dct_fullDict     /* refuses to load a dictionary if it does not respect Zstandard's specification */
+} ZSTD_dictContentType_e;
+
+typedef enum {
+    ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */
+    ZSTD_dlm_byRef,      /**< Reference dictionary content -- the dictionary buffer must outlive its users. */
+} ZSTD_dictLoadMethod_e;
+


 /***************************************
@ -483,12 +487,12 @@ ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);


 /***************************************
-*  Context memory usage
+*  Memory management
 ***************************************/

 /*! ZSTD_sizeof_*() :
 *  These functions give the current memory usage of selected object.
- *  Object memory usage can evolve when re-used multiple times. */
+ *  Object memory usage can evolve when re-used. */
 ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
 ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx);
 ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);
@ -503,8 +507,8 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
 *  It will also consider src size to be arbitrarily "large", which is worst case.
 *  If srcSize is known to always be small, ZSTD_estimateCCtxSize_usingCParams() can provide a tighter estimation.
 *  ZSTD_estimateCCtxSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
- *  ZSTD_estimateCCtxSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParam_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_p_nbThreads is > 1.
- *  Note : CCtx estimation is only correct for single-threaded compression */
+ *  ZSTD_estimateCCtxSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParam_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_p_nbWorkers is >= 1.
+ *  Note : CCtx size estimation is only correct for single-threaded compression. */
 ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
 ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
 ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
@ -515,8 +519,8 @@ ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void);
 *  It will also consider src size to be arbitrarily "large", which is worst case.
 *  If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
 *  ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
- *  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParam_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_p_nbThreads is set to a value > 1.
- *  Note : CStream estimation is only correct for single-threaded compression.
+ *  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParam_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_p_nbWorkers is >= 1.
+ *  Note : CStream size estimation is only correct for single-threaded compression.
 *  ZSTD_DStream memory budget depends on window Size.
 *  This information can be passed manually, using ZSTD_estimateDStreamSize,
 *  or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
@ -529,46 +533,86 @@ ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_para
 ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
 ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);

-typedef enum {
-    ZSTD_dlm_byCopy = 0,     /**< Copy dictionary content internally */
-    ZSTD_dlm_byRef,          /**< Reference dictionary content -- the dictionary buffer must outlive its users. */
-} ZSTD_dictLoadMethod_e;
-
 /*! ZSTD_estimate?DictSize() :
 *  ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
- *  ZSTD_estimateCStreamSize_advanced_usingCParams() makes it possible to control precisely compression parameters, like ZSTD_createCDict_advanced().
- *  Note : dictionary created by reference using ZSTD_dlm_byRef are smaller
+ *  ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced().
+ *  Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller.
 */
 ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
 ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod);
 ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod);

+/*! ZSTD_initStatic*() :
+ *  Initialize an object using a pre-allocated fixed-size buffer.
+ *  workspace: The memory area to emplace the object into.
+ *             Provided pointer *must be 8-bytes aligned*.
+ *             Buffer must outlive object.
+ *  workspaceSize: Use ZSTD_estimate*Size() to determine
+ *                 how large workspace must be to support target scenario.
+ * @return : pointer to object (same address as workspace, just different type),
+ *           or NULL if error (size too small, incorrect alignment, etc.)
+ *  Note : zstd will never resize nor malloc() when using a static buffer.
+ *         If the object requires more memory than available,
+ *         zstd will just error out (typically ZSTD_error_memory_allocation).
+ *  Note 2 : there is no corresponding "free" function.
+ *           Since workspace is allocated externally, it must be freed externally too.
+ *  Note 3 : cParams : use ZSTD_getCParams() to convert a compression level
+ *           into its associated cParams.
+ *  Limitation 1 : currently not compatible with internal dictionary creation, triggered by
+ *                 ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict().
+ *  Limitation 2 : static cctx currently not compatible with multi-threading.
+ *  Limitation 3 : static dctx is incompatible with legacy support.
+ */
+ZSTDLIB_API ZSTD_CCtx*    ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticCCtx() */
+
+ZSTDLIB_API ZSTD_DCtx*    ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticDCtx() */
+
+ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict(
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType,
+                                        ZSTD_compressionParameters cParams);
+
+ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict(
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType);
+
+/*! Custom memory allocation :
+ *  These prototypes make it possible to pass your own allocation/free functions.
+ *  ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below.
+ *  All allocation/free operations will be completed using these custom variants instead of regular <stdlib.h> ones.
+ */
+typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void  (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL };  /**< this constant defers to stdlib's functions */
+
+ZSTDLIB_API ZSTD_CCtx*    ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DCtx*    ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
+                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
+                                                  ZSTD_dictContentType_e dictContentType,
+                                                  ZSTD_compressionParameters cParams,
+                                                  ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
+                                                  ZSTD_dictContentType_e dictContentType,
+                                                  ZSTD_customMem customMem);
+
+

 /***************************************
 *  Advanced compression functions
 ***************************************/
-/*! ZSTD_createCCtx_advanced() :
- *  Create a ZSTD compression context using external alloc and free functions */
-ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
-
-/*! ZSTD_initStaticCCtx() : initialize a fixed-size zstd compression context
- *  workspace: The memory area to emplace the context into.
- *             Provided pointer must 8-bytes aligned.
- *             It must outlive context usage.
- *  workspaceSize: Use ZSTD_estimateCCtxSize() or ZSTD_estimateCStreamSize()
- *                 to determine how large workspace must be to support scenario.
- * @return : pointer to ZSTD_CCtx* (same address as workspace, but different type),
- *           or NULL if error (typically size too small)
- *  Note : zstd will never resize nor malloc() when using a static cctx.
- *         If it needs more memory than available, it will simply error out.
- *  Note 2 : there is no corresponding "free" function.
- *           Since workspace was allocated externally, it must be freed externally too.
- *  Limitation 1 : currently not compatible with internal CDict creation, such as
- *                 ZSTD_CCtx_loadDictionary() or ZSTD_initCStream_usingDict().
- *  Limitation 2 : currently not compatible with multi-threading
- */
-ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
-

 /*! ZSTD_createCDict_byReference() :
 *  Create a digested dictionary for compression
@ -576,38 +620,6 @@ ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize
 *  It is important that dictBuffer outlives CDict, it must remain read accessible throughout the lifetime of CDict */
 ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel);

-typedef enum { ZSTD_dm_auto=0,        /* dictionary is "full" if it starts with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
-               ZSTD_dm_rawContent,    /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
-               ZSTD_dm_fullDict       /* refuses to load a dictionary if it does not respect Zstandard's specification */
-} ZSTD_dictMode_e;
-/*! ZSTD_createCDict_advanced() :
- *  Create a ZSTD_CDict using external alloc and free, and customized compression parameters */
-ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
-                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
-                                                  ZSTD_dictMode_e dictMode,
-                                                  ZSTD_compressionParameters cParams,
-                                                  ZSTD_customMem customMem);
-
-/*! ZSTD_initStaticCDict() :
- *  Generate a digested dictionary in provided memory area.
- *  workspace: The memory area to emplace the dictionary into.
- *             Provided pointer must 8-bytes aligned.
- *             It must outlive dictionary usage.
- *  workspaceSize: Use ZSTD_estimateCDictSize()
- *                 to determine how large workspace must be.
- *  cParams : use ZSTD_getCParams() to transform a compression level
- *            into its relevants cParams.
- * @return : pointer to ZSTD_CDict* (same address as workspace, but different type),
- *           or NULL if error (typically, size too small).
- *  Note : there is no corresponding "free" function.
- *         Since workspace was allocated externally, it must be freed externally.
- */
-ZSTDLIB_API ZSTD_CDict* ZSTD_initStaticCDict(
-                            void* workspace, size_t workspaceSize,
-                      const void* dict, size_t dictSize,
-                            ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictMode_e dictMode,
-                            ZSTD_compressionParameters cParams);
-
 /*! ZSTD_getCParams() :
 *   @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize.
 *   `estimatedSrcSize` value is optional, select 0 if not known */
@ -652,28 +664,6 @@ ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
 *  Note 3 : Skippable Frame Identifiers are considered valid. */
 ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size);

-/*! ZSTD_createDCtx_advanced() :
- *  Create a ZSTD decompression context using external alloc and free functions */
-ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
-
-/*! ZSTD_initStaticDCtx() : initialize a fixed-size zstd decompression context
- *  workspace: The memory area to emplace the context into.
- *             Provided pointer must 8-bytes aligned.
- *             It must outlive context usage.
- *  workspaceSize: Use ZSTD_estimateDCtxSize() or ZSTD_estimateDStreamSize()
- *                 to determine how large workspace must be to support scenario.
- * @return : pointer to ZSTD_DCtx* (same address as workspace, but different type),
- *           or NULL if error (typically size too small)
- *  Note : zstd will never resize nor malloc() when using a static dctx.
- *         If it needs more memory than available, it will simply error out.
- *  Note 2 : static dctx is incompatible with legacy support
- *  Note 3 : there is no corresponding "free" function.
- *           Since workspace was allocated externally, it must be freed externally.
- *  Limitation : currently not compatible with internal DDict creation,
- *               such as ZSTD_initDStream_usingDict().
- */
-ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
-
 /*! ZSTD_createDDict_byReference() :
 *  Create a digested dictionary, ready to start decompression operation without startup delay.
 *  Dictionary content is referenced, and therefore stays in dictBuffer.
@ -681,26 +671,6 @@ ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize
 *  it must remain read accessible throughout the lifetime of DDict */
 ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize);

-/*! ZSTD_createDDict_advanced() :
- *  Create a ZSTD_DDict using external alloc and free, optionally by reference */
-ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
-                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
-                                                  ZSTD_customMem customMem);
-
-/*! ZSTD_initStaticDDict() :
- *  Generate a digested dictionary in provided memory area.
- *  workspace: The memory area to emplace the dictionary into.
- *             Provided pointer must 8-bytes aligned.
- *             It must outlive dictionary usage.
- *  workspaceSize: Use ZSTD_estimateDDictSize()
- *                 to determine how large workspace must be.
- * @return : pointer to ZSTD_DDict*, or NULL if error (size too small)
- *  Note : there is no corresponding "free" function.
- *         Since workspace was allocated externally, it must be freed externally.
- */
-ZSTDLIB_API ZSTD_DDict* ZSTD_initStaticDDict(void* workspace, size_t workspaceSize,
-                                             const void* dict, size_t dictSize,
-                                             ZSTD_dictLoadMethod_e dictLoadMethod);

 /*! ZSTD_getDictID_fromDict() :
 *  Provides the dictID stored within dictionary.
@ -732,8 +702,6 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
 ********************************************************************/

 /*=====   Advanced Streaming compression functions  =====*/
-ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
-ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticCCtx() */
 ZSTDLIB_API size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize);   /**< pledgedSrcSize must be correct. If it is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, "0" also disables frame content size field. It may be enabled in the future. */
 ZSTDLIB_API size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); /**< creates of an internal CDict (incompatible with static CCtx), except if dict == NULL or dictSize < 8, in which case no dict is used. Note: dict is loaded with ZSTD_dm_auto (treated as a full zstd dictionary if it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy.*/
 ZSTDLIB_API size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize,
@ -748,14 +716,28 @@ ZSTDLIB_API size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const
 *  If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
 *  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
 *  For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs,
- *  but it may change to mean "empty" in some future version, so prefer using macro ZSTD_CONTENTSIZE_UNKNOWN.
+ *  but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead.
 * @return : 0, or an error code (which can be tested using ZSTD_isError()) */
 ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);


+typedef struct {
+    unsigned long long ingested;
+    unsigned long long consumed;
+    unsigned long long produced;
+} ZSTD_frameProgression;
+
+/* ZSTD_getFrameProgression():
+ * tells how much data has been ingested (read from input)
+ * consumed (input actually compressed) and produced (output) for current frame.
+ * Therefore, (ingested - consumed) is amount of input data buffered internally, not yet compressed.
+ * Can report progression inside worker threads (multi-threading and non-blocking mode).
+ */
+ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx);
+
+
+
 /*=====   Advanced Streaming decompression functions  =====*/
-ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
-ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticDCtx() */
 typedef enum { DStream_p_maxWindowSize } ZSTD_DStreamParameter_e;
 ZSTDLIB_API size_t ZSTD_setDStreamParameter(ZSTD_DStream* zds, ZSTD_DStreamParameter_e paramType, unsigned paramValue);   /* obsolete : this API will be removed in a future version */
 ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /**< note: no dictionary will be used if dict == NULL or dictSize < 8 */
@ -924,10 +906,8 @@ ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
 *   and then applied on all subsequent compression jobs.
 *   When no parameter is ever provided, CCtx is created with compression level ZSTD_CLEVEL_DEFAULT.
 *
- *   This API is intended to replace all others experimental API.
- *   It can basically do all other use cases, and even new ones.
- *   In constrast with _advanced() variants, it stands a reasonable chance to become "stable",
- *   after a good testing period.
+ *   This API is intended to replace all others advanced / experimental API entry points.
+ *   But it stands a reasonable chance to become "stable", after a reasonable testing period.
 */

 /* note on naming convention :
@ -944,12 +924,12 @@ ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
 * All enum will be pinned to explicit values before reaching "stable API" status */

 typedef enum {
-    /* Question : should we have a format ZSTD_f_auto ?
-     * For the time being, it would mean exactly the same as ZSTD_f_zstd1.
-     * But, in the future, should several formats be supported,
+    /* Opened question : should we have a format ZSTD_f_auto ?
+     * Today, it would mean exactly the same as ZSTD_f_zstd1.
+     * But, in the future, should several formats become supported,
     * on the compression side, it would mean "default format".
-     * On the decompression side, it would mean "multi format",
-     * and ZSTD_f_zstd1 could be reserved to mean "accept *only* zstd frames".
+     * On the decompression side, it would mean "automatic format detection",
+     * so that ZSTD_f_zstd1 would mean "accept *only* zstd frames".
     * Since meaning is a little different, another option could be to define different enums for compression and decompression.
     * This question could be kept for later, when there are actually multiple formats to support,
     * but there is also the question of pinning enum values, and pinning value `0` is especially important */
@ -967,42 +947,76 @@ typedef enum {
    /* compression parameters */
    ZSTD_p_compressionLevel=100, /* Update all compression parameters according to pre-defined cLevel table
                              * Default level is ZSTD_CLEVEL_DEFAULT==3.
-                              * Special: value 0 means "do not change cLevel". */
+                              * Special: value 0 means "do not change cLevel".
+                              * Note 1 : it's possible to pass a negative compression level by casting it to unsigned type.
+                              * Note 2 : setting a level sets all default values of other compression parameters.
+                              * Note 3 : setting compressionLevel automatically updates ZSTD_p_compressLiterals. */
    ZSTD_p_windowLog,        /* Maximum allowed back-reference distance, expressed as power of 2.
                              * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX.
-                              * Special: value 0 means "do not change windowLog".
+                              * Special: value 0 means "use default windowLog".
                              * Note: Using a window size greater than ZSTD_MAXWINDOWSIZE_DEFAULT (default: 2^27)
-                              * requires setting the maximum window size at least as large during decompression. */
+                              *       requires explicitly allowing such window size during decompression stage. */
    ZSTD_p_hashLog,          /* Size of the probe table, as a power of 2.
                              * Resulting table size is (1 << (hashLog+2)).
                              * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX.
                              * Larger tables improve compression ratio of strategies <= dFast,
                              * and improve speed of strategies > dFast.
-                              * Special: value 0 means "do not change hashLog". */
+                              * Special: value 0 means "use default hashLog". */
    ZSTD_p_chainLog,         /* Size of the full-search table, as a power of 2.
                              * Resulting table size is (1 << (chainLog+2)).
                              * Larger tables result in better and slower compression.
                              * This parameter is useless when using "fast" strategy.
-                              * Special: value 0 means "do not change chainLog". */
+                              * Special: value 0 means "use default chainLog". */
    ZSTD_p_searchLog,        /* Number of search attempts, as a power of 2.
                              * More attempts result in better and slower compression.
                              * This parameter is useless when using "fast" and "dFast" strategies.
-                              * Special: value 0 means "do not change searchLog". */
+                              * Special: value 0 means "use default searchLog". */
    ZSTD_p_minMatch,         /* Minimum size of searched matches (note : repCode matches can be smaller).
                              * Larger values make faster compression and decompression, but decrease ratio.
                              * Must be clamped between ZSTD_SEARCHLENGTH_MIN and ZSTD_SEARCHLENGTH_MAX.
                              * Note that currently, for all strategies < btopt, effective minimum is 4.
-                              * Note that currently, for all strategies > fast, effective maximum is 6.
-                              * Special: value 0 means "do not change minMatchLength". */
-    ZSTD_p_targetLength,     /* Only useful for strategies >= btopt.
-                              * Length of Match considered "good enough" to stop search.
-                              * Larger values make compression stronger and slower.
-                              * Special: value 0 means "do not change targetLength". */
+                              *                    , for all strategies > fast, effective maximum is 6.
+                              * Special: value 0 means "use default minMatchLength". */
+    ZSTD_p_targetLength,     /* Impact of this field depends on strategy.
+                              * For strategies btopt & btultra:
+                              *     Length of Match considered "good enough" to stop search.
+                              *     Larger values make compression stronger, and slower.
+                              * For strategy fast:
+                              *     Distance between match sampling.
+                              *     Larger values make compression faster, and weaker.
+                              * Special: value 0 means "use default targetLength". */
    ZSTD_p_compressionStrategy, /* See ZSTD_strategy enum definition.
                              * Cast selected strategy as unsigned for ZSTD_CCtx_setParameter() compatibility.
                              * The higher the value of selected strategy, the more complex it is,
                              * resulting in stronger and slower compression.
-                              * Special: value 0 means "do not change strategy". */
+                              * Special: value 0 means "use default strategy". */
+
+    ZSTD_p_enableLongDistanceMatching=160, /* Enable long distance matching.
+                                         * This parameter is designed to improve compression ratio
+                                         * for large inputs, by finding large matches at long distance.
+                                         * It increases memory usage and window size.
+                                         * Note: enabling this parameter increases ZSTD_p_windowLog to 128 MB
+                                         * except when expressly set to a different value. */
+    ZSTD_p_ldmHashLog,       /* Size of the table for long distance matching, as a power of 2.
+                              * Larger values increase memory usage and compression ratio,
+                              * but decrease compression speed.
+                              * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX
+                              * default: windowlog - 7.
+                              * Special: value 0 means "automatically determine hashlog". */
+    ZSTD_p_ldmMinMatch,      /* Minimum match size for long distance matcher.
+                              * Larger/too small values usually decrease compression ratio.
+                              * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX.
+                              * Special: value 0 means "use default value" (default: 64). */
+    ZSTD_p_ldmBucketSizeLog, /* Log size of each bucket in the LDM hash table for collision resolution.
+                              * Larger values improve collision resolution but decrease compression speed.
+                              * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX .
+                              * Special: value 0 means "use default value" (default: 3). */
+    ZSTD_p_ldmHashEveryLog,  /* Frequency of inserting/looking up entries in the LDM hash table.
+                              * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN).
+                              * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage.
+                              * Larger values improve compression speed.
+                              * Deviating far from default value will likely result in a compression ratio decrease.
+                              * Special: value 0 means "automatically determine hashEveryLog". */

    /* frame parameters */
    ZSTD_p_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1)
@ -1012,58 +1026,45 @@ typedef enum {
    ZSTD_p_dictIDFlag,       /* When applicable, dictionary's ID is written into frame header (default:1) */

    /* multi-threading parameters */
-    ZSTD_p_nbThreads=400,    /* Select how many threads a compression job can spawn (default:1)
-                              * More threads improve speed, but also increase memory usage.
-                              * Can only receive a value > 1 if ZSTD_MULTITHREAD is enabled.
-                              * Special: value 0 means "do not change nbThreads" */
-    ZSTD_p_jobSize,          /* Size of a compression job. This value is only enforced in streaming (non-blocking) mode.
-                              * Each compression job is completed in parallel, so indirectly controls the nb of active threads.
+    /* These parameters are only useful if multi-threading is enabled (ZSTD_MULTITHREAD).
+     * They return an error otherwise. */
+    ZSTD_p_nbWorkers=400,    /* Select how many threads will be spawned to compress in parallel.
+                              * When nbWorkers >= 1, triggers asynchronous mode :
+                              * ZSTD_compress_generic() consumes some input, flush some output if possible, and immediately gives back control to caller,
+                              * while compression work is performed in parallel, within worker threads.
+                              * (note : a strong exception to this rule is when first invocation sets ZSTD_e_end : it becomes a blocking call).
+                              * More workers improve speed, but also increase memory usage.
+                              * Default value is `0`, aka "single-threaded mode" : no worker is spawned, compression is performed inside Caller's thread, all invocations are blocking */
+    ZSTD_p_jobSize,          /* Size of a compression job. This value is enforced only in non-blocking mode.
+                              * Each compression job is completed in parallel, so this value indirectly controls the nb of active threads.
                              * 0 means default, which is dynamically determined based on compression parameters.
-                              * Job size must be a minimum of overlapSize, or 1 KB, whichever is largest
+                              * Job size must be a minimum of overlapSize, or 1 MB, whichever is largest.
                              * The minimum size is automatically and transparently enforced */
    ZSTD_p_overlapSizeLog,   /* Size of previous input reloaded at the beginning of each job.
                              * 0 => no overlap, 6(default) => use 1/8th of windowSize, >=9 => use full windowSize */

-    /* advanced parameters - may not remain available after API update */
+    /* =================================================================== */
+    /* experimental parameters - no stability guaranteed                   */
+    /* =================================================================== */
+
+    ZSTD_p_compressLiterals=1000, /* control huffman compression of literals (enabled) by default.
+                              * disabling it improves speed and decreases compression ratio by a large amount.
+                              * note : this setting is automatically updated when changing compression level.
+                              *        positive compression levels set ZSTD_p_compressLiterals to 1.
+                              *        negative compression levels set ZSTD_p_compressLiterals to 0. */
+
    ZSTD_p_forceMaxWindow=1100, /* Force back-reference distances to remain < windowSize,
                              * even when referencing into Dictionary content (default:0) */
-    ZSTD_p_enableLongDistanceMatching=1200,  /* Enable long distance matching.
-                                         * This parameter is designed to improve the compression
-                                         * ratio for large inputs with long distance matches.
-                                         * This increases the memory usage as well as window size.
-                                         * Note: setting this parameter sets all the LDM parameters
-                                         * as well as ZSTD_p_windowLog. It should be set after
-                                         * ZSTD_p_compressionLevel and before ZSTD_p_windowLog and
-                                         * other LDM parameters. Setting the compression level
-                                         * after this parameter overrides the window log, though LDM
-                                         * will remain enabled until explicitly disabled. */
-    ZSTD_p_ldmHashLog,   /* Size of the table for long distance matching, as a power of 2.
-                          * Larger values increase memory usage and compression ratio, but decrease
-                          * compression speed.
-                          * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX
-                          * (default: windowlog - 7). */
-    ZSTD_p_ldmMinMatch,  /* Minimum size of searched matches for long distance matcher.
-                          * Larger/too small values usually decrease compression ratio.
-                          * Must be clamped between ZSTD_LDM_MINMATCH_MIN
-                          * and ZSTD_LDM_MINMATCH_MAX (default: 64). */
-    ZSTD_p_ldmBucketSizeLog,  /* Log size of each bucket in the LDM hash table for collision resolution.
-                               * Larger values usually improve collision resolution but may decrease
-                               * compression speed.
-                               * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX (default: 3). */
-    ZSTD_p_ldmHashEveryLog,  /* Frequency of inserting/looking up entries in the LDM hash table.
-                              * The default is MAX(0, (windowLog - ldmHashLog)) to
-                              * optimize hash table usage.
-                              * Larger values improve compression speed. Deviating far from the
-                              * default value will likely result in a decrease in compression ratio.
-                              * Must be clamped between 0 and ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN. */

 } ZSTD_cParameter;


 /*! ZSTD_CCtx_setParameter() :
 *  Set one compression parameter, selected by enum ZSTD_cParameter.
+ *  Setting a parameter is generally only possible during frame initialization (before starting compression),
+ *  except for a few exceptions which can be updated during compression: compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy.
 *  Note : when `value` is an enum, cast it to unsigned for proper type checking.
- *  @result : informational value (typically, the one being set, possibly corrected),
+ *  @result : informational value (typically, value being set clamped correctly),
 *            or an error code (which can be tested with ZSTD_isError()). */
 ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned value);

@ -1079,26 +1080,24 @@ ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param
 ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);

 /*! ZSTD_CCtx_loadDictionary() :
- *  Create an internal CDict from dict buffer.
- *  Decompression will have to use same buffer.
+ *  Create an internal CDict from `dict` buffer.
+ *  Decompression will have to use same dictionary.
 * @result : 0, or an error code (which can be tested with ZSTD_isError()).
- *  Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
- *            meaning "return to no-dictionary mode".
- *  Note 1 : `dict` content will be copied internally. Use
- *            ZSTD_CCtx_loadDictionary_byReference() to reference dictionary
- *            content instead. The dictionary buffer must then outlive its
- *            users.
+ *  Special: Adding a NULL (or 0-size) dictionary invalidates previous dictionary,
+ *           meaning "return to no-dictionary mode".
+ *  Note 1 : Dictionary will be used for all future compression jobs.
+ *           To return to "no-dictionary" situation, load a NULL dictionary
 *  Note 2 : Loading a dictionary involves building tables, which are dependent on compression parameters.
 *           For this reason, compression parameters cannot be changed anymore after loading a dictionary.
- *           It's also a CPU-heavy operation, with non-negligible impact on latency.
- *  Note 3 : Dictionary will be used for all future compression jobs.
- *           To return to "no-dictionary" situation, load a NULL dictionary
- *  Note 5 : Use ZSTD_CCtx_loadDictionary_advanced() to select how dictionary
- *           content will be interpreted.
- */
+ *           It's also a CPU consuming operation, with non-negligible impact on latency.
+ *  Note 3 :`dict` content will be copied internally.
+ *           Use ZSTD_CCtx_loadDictionary_byReference() to reference dictionary content instead.
+ *           In such a case, dictionary buffer must outlive its users.
+ *  Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
+ *           to precisely select how dictionary content must be interpreted. */
 ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
 ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
-ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictMode_e dictMode);
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);


 /*! ZSTD_CCtx_refCDict() :
@ -1110,8 +1109,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void
 *  Special : adding a NULL CDict means "return to no-dictionary mode".
 *  Note 1 : Currently, only one dictionary can be managed.
 *           Adding a new dictionary effectively "discards" any previous one.
- *  Note 2 : CDict is just referenced, its lifetime must outlive CCtx.
- */
+ *  Note 2 : CDict is just referenced, its lifetime must outlive CCtx. */
 ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);

 /*! ZSTD_CCtx_refPrefix() :
@ -1121,20 +1119,29 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
 *  Subsequent compression jobs will be done without prefix (if none is explicitly referenced).
 *  If there is a need to use same prefix multiple times, consider embedding it into a ZSTD_CDict instead.
 * @result : 0, or an error code (which can be tested with ZSTD_isError()).
- *  Special : Adding any prefix (including NULL) invalidates any previous prefix or dictionary
+ *  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
 *  Note 1 : Prefix buffer is referenced. It must outlive compression job.
 *  Note 2 : Referencing a prefix involves building tables, which are dependent on compression parameters.
- *           It's a CPU-heavy operation, with non-negligible impact on latency.
- *  Note 3 : By default, the prefix is treated as raw content
- *           (ZSTD_dm_rawContent). Use ZSTD_CCtx_refPrefix_advanced() to alter
- *           dictMode. */
+ *           It's a CPU consuming operation, with non-negligible impact on latency.
+ *  Note 3 : By default, the prefix is treated as raw content (ZSTD_dm_rawContent).
+ *           Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode. */
 ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize);
-ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictMode_e dictMode);
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_CCtx_reset() :
+ *  Return a CCtx to clean state.
+ *  Useful after an error, or to interrupt an ongoing compression job and start a new one.
+ *  Any internal data not yet flushed is cancelled.
+ *  Dictionary (if any) is dropped.
+ *  All parameters are back to default values.
+ *  It's possible to modify compression parameters after a reset.
+ */
+ZSTDLIB_API void ZSTD_CCtx_reset(ZSTD_CCtx* cctx);



 typedef enum {
-    ZSTD_e_continue=0, /* collect more data, encoder transparently decides when to output result, for optimal conditions */
+    ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal conditions */
    ZSTD_e_flush,      /* flush any data provided so far - frame will continue, future data can still reference previous data for better compression */
    ZSTD_e_end         /* flush any remaining data and close current frame. Any additional data starts a new frame. */
 } ZSTD_EndDirective;
@ -1150,10 +1157,11 @@ typedef enum {
 *                                                     and then immediately returns, just indicating that there is some data remaining to be flushed.
 *                                                     The function nonetheless guarantees forward progress : it will return only after it reads or write at least 1+ byte.
 *  - Exception : in multi-threading mode, if the first call requests a ZSTD_e_end directive, it is blocking : it will complete compression before giving back control to caller.
- *  - @return provides the minimum amount of data remaining to be flushed from internal buffers
+ *  - @return provides a minimum amount of data remaining to be flushed from internal buffers
 *            or an error code, which can be tested using ZSTD_isError().
 *            if @return != 0, flush is not fully completed, there is still some data left within internal buffers.
- *            This is useful to determine if a ZSTD_e_flush or ZSTD_e_end directive is completed.
+ *            This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers.
+ *            For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed.
 *  - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0),
 *            only ZSTD_e_end or ZSTD_e_flush operations are allowed.
 *            Before starting a new compression job, or changing compression parameters,
@ -1164,16 +1172,6 @@ ZSTDLIB_API size_t ZSTD_compress_generic (ZSTD_CCtx* cctx,
                                          ZSTD_inBuffer* input,
                                          ZSTD_EndDirective endOp);

-/*! ZSTD_CCtx_reset() :
- *  Return a CCtx to clean state.
- *  Useful after an error, or to interrupt an ongoing compression job and start a new one.
- *  Any internal data not yet flushed is cancelled.
- *  Dictionary (if any) is dropped.
- *  All parameters are back to default values.
- *  It's possible to modify compression parameters after a reset.
- */
-ZSTDLIB_API void ZSTD_CCtx_reset(ZSTD_CCtx* cctx);   /* Not ready yet ! */
-

 /*! ZSTD_compress_generic_simpleArgs() :
 *  Same as ZSTD_compress_generic(),
@ -1207,25 +1205,26 @@ ZSTDLIB_API size_t ZSTD_compress_generic_simpleArgs (
 *  for static allocation for single-threaded compression.
 */
 ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void);
+ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);

-/*! ZSTD_resetCCtxParams() :
- *  Reset params to default, with the default compression level.
+
+/*! ZSTD_CCtxParams_reset() :
+ *  Reset params to default values.
 */
-ZSTDLIB_API size_t ZSTD_resetCCtxParams(ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params);

-/*! ZSTD_initCCtxParams() :
+/*! ZSTD_CCtxParams_init() :
 *  Initializes the compression parameters of cctxParams according to
 *  compression level. All other parameters are reset to their default values.
 */
-ZSTDLIB_API size_t ZSTD_initCCtxParams(ZSTD_CCtx_params* cctxParams, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel);

-/*! ZSTD_initCCtxParams_advanced() :
+/*! ZSTD_CCtxParams_init_advanced() :
 *  Initializes the compression and frame parameters of cctxParams according to
 *  params. All other parameters are reset to their default values.
 */
-ZSTDLIB_API size_t ZSTD_initCCtxParams_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params);
+ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params);

-ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);

 /*! ZSTD_CCtxParam_setParameter() :
 *  Similar to ZSTD_CCtx_setParameter.
@ -1238,9 +1237,10 @@ ZSTDLIB_API size_t ZSTD_CCtxParam_setParameter(ZSTD_CCtx_params* params, ZSTD_cP

 /*! ZSTD_CCtx_setParametersUsingCCtxParams() :
 *  Apply a set of ZSTD_CCtx_params to the compression context.
- *  This must be done before the dictionary is loaded.
- *  The pledgedSrcSize is treated as unknown.
- *  Multithreading parameters are applied only if nbThreads > 1.
+ *  This can be done even after compression is started,
+ *    if nbWorkers==0, this will have no impact until a new compression is started.
+ *    if nbWorkers>=1, new parameters will be picked up at next job,
+ *       with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).
 */
 ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
        ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
@ -1267,9 +1267,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
 *  Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to select
 *           how dictionary content will be interpreted and loaded.
 */
-ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);   /* not implemented */
-ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);   /* not implemented */
-ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictMode_e dictMode);   /* not implemented */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);


 /*! ZSTD_DCtx_refDDict() :
@ -1281,7 +1281,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void
 *  Special : adding a NULL DDict means "return to no-dictionary mode".
 *  Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
 */
-ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);   /* not implemented */
+ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);


 /*! ZSTD_DCtx_refPrefix() :
@ -1295,8 +1295,8 @@ ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
 *           Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode.
 *  Note 4 : Referencing a raw content prefix has almost no cpu nor memory cost.
 */
-ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize);   /* not implemented */
-ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictMode_e dictMode);   /* not implemented */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize);
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);


 /*! ZSTD_DCtx_setMaxWindowSize() :
@ -1389,7 +1389,7 @@ ZSTDLIB_API void ZSTD_DCtx_reset(ZSTD_DCtx* dctx);
 ZSTDLIB_API size_t ZSTD_getBlockSize   (const ZSTD_CCtx* cctx);
 ZSTDLIB_API size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
-ZSTDLIB_API size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize);  /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression */
+ZSTDLIB_API size_t ZSTD_insertBlock    (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize);  /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */


 #endif   /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
--- a/sys/contrib/zstd/programs/Makefile
+++ b/sys/contrib/zstd/programs/Makefile
@ -157,6 +157,8 @@ endif
 zstd-release: DEBUGFLAGS :=
 zstd-release: zstd

+zstd32 : CPPFLAGS += $(THREAD_CPP)
+zstd32 : LDFLAGS += $(THREAD_LD) 
 zstd32 : CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
 zstd32 : $(ZSTDLIB_FILES) zstdcli.c fileio.c bench.c datagen.c dibio.c
 ifneq (,$(filter Windows%,$(OS)))
--- a/sys/contrib/zstd/programs/README.md
+++ b/sys/contrib/zstd/programs/README.md
@ -11,44 +11,14 @@ There are however other Makefile targets that create different variations of CLI


 #### Compilation variables
-`zstd` scope can be altered by modifying the following compilation variables :
+`zstd` scope can be altered by modifying the following `make` variables :

 - __HAVE_THREAD__ : multithreading is automatically enabled when `pthread` is detected.
-  It's possible to disable multithread support, by setting HAVE_THREAD=0 .
-  Example : make zstd HAVE_THREAD=0
-  It's also possible to force compilation with multithread support, using HAVE_THREAD=1.
-  In which case, linking stage will fail if `pthread` library cannot be found.
-  This might be useful to prevent silent feature disabling.
-
- __HAVE_ZLIB__ : `zstd` can compress and decompress files in `.gz` format.
-  This is ordered through command `--format=gzip`.
-  Alternatively, symlinks named `gzip` or `gunzip` will mimic intended behavior.
-  `.gz` support is automatically enabled when `zlib` library is detected at build time.
-  It's possible to disable `.gz` support, by setting HAVE_ZLIB=0.
-  Example : make zstd HAVE_ZLIB=0
-  It's also possible to force compilation with zlib support, using HAVE_ZLIB=1.
-  In which case, linking stage will fail if `zlib` library cannot be found.
-  This might be useful to prevent silent feature disabling.
-
- __HAVE_LZMA__ : `zstd` can compress and decompress files in `.xz` and `.lzma` formats.
-  This is ordered through commands `--format=xz` and `--format=lzma` respectively.
-  Alternatively, symlinks named `xz`, `unxz`, `lzma`, or `unlzma` will mimic intended behavior.
-  `.xz` and `.lzma` support is automatically enabled when `lzma` library is detected at build time.
-  It's possible to disable `.xz` and `.lzma` support, by setting HAVE_LZMA=0 .
-  Example : make zstd HAVE_LZMA=0
-  It's also possible to force compilation with lzma support, using HAVE_LZMA=1.
-  In which case, linking stage will fail if `lzma` library cannot be found.
-  This might be useful to prevent silent feature disabling.
-
- __HAVE_LZ4__ : `zstd` can compress and decompress files in `.lz4` formats.
-  This is ordered through commands `--format=lz4`.
-  Alternatively, symlinks named `lz4`, or `unlz4` will mimic intended behavior.
-  `.lz4` support is automatically enabled when `lz4` library is detected at build time.
-  It's possible to disable `.lz4` support, by setting HAVE_LZ4=0 .
-  Example : make zstd HAVE_LZ4=0
-  It's also possible to force compilation with lz4 support, using HAVE_LZ4=1.
-  In which case, linking stage will fail if `lz4` library cannot be found.
-  This might be useful to prevent silent feature disabling.
+  It's possible to disable multithread support, by setting `HAVE_THREAD=0`.
+  Example : `make zstd HAVE_THREAD=0`
+  It's also possible to force multithread support, using `HAVE_THREAD=1`.
+  In which case, linking stage will fail if neither `pthread` nor `windows.h` library can be found.
+  This is useful to ensure this feature is not silently disabled.

 - __ZSTD_LEGACY_SUPPORT__ : `zstd` can decompress files compressed by older versions of `zstd`.
  Starting v0.8.0, all versions of `zstd` produce frames compliant with the [specification](../doc/zstd_compression_format.md), and are therefore compatible.
@ -61,11 +31,54 @@ There are however other Makefile targets that create different variations of CLI
  if `ZSTD_LEGACY_SUPPORT >= 8`, it's the same as `0`, since there is no legacy format after `7`.
  Note : `zstd` only supports decoding older formats, and cannot generate any legacy format.

+- __HAVE_ZLIB__ : `zstd` can compress and decompress files in `.gz` format.
+  This is ordered through command `--format=gzip`.
+  Alternatively, symlinks named `gzip` or `gunzip` will mimic intended behavior.
+  `.gz` support is automatically enabled when `zlib` library is detected at build time.
+  It's possible to disable `.gz` support, by setting `HAVE_ZLIB=0`.
+  Example : `make zstd HAVE_ZLIB=0`
+  It's also possible to force compilation with zlib support, `using HAVE_ZLIB=1`.
+  In which case, linking stage will fail if `zlib` library cannot be found.
+  This is useful to prevent silent feature disabling.
+
+- __HAVE_LZMA__ : `zstd` can compress and decompress files in `.xz` and `.lzma` formats.
+  This is ordered through commands `--format=xz` and `--format=lzma` respectively.
+  Alternatively, symlinks named `xz`, `unxz`, `lzma`, or `unlzma` will mimic intended behavior.
+  `.xz` and `.lzma` support is automatically enabled when `lzma` library is detected at build time.
+  It's possible to disable `.xz` and `.lzma` support, by setting `HAVE_LZMA=0` .
+  Example : `make zstd HAVE_LZMA=0`
+  It's also possible to force compilation with lzma support, using `HAVE_LZMA=1`.
+  In which case, linking stage will fail if `lzma` library cannot be found.
+  This is useful to prevent silent feature disabling.
+
+- __HAVE_LZ4__ : `zstd` can compress and decompress files in `.lz4` formats.
+  This is ordered through commands `--format=lz4`.
+  Alternatively, symlinks named `lz4`, or `unlz4` will mimic intended behavior.
+  `.lz4` support is automatically enabled when `lz4` library is detected at build time.
+  It's possible to disable `.lz4` support, by setting `HAVE_LZ4=0` .
+  Example : `make zstd HAVE_LZ4=0`
+  It's also possible to force compilation with lz4 support, using `HAVE_LZ4=1`.
+  In which case, linking stage will fail if `lz4` library cannot be found.
+  This is useful to prevent silent feature disabling.
+

 #### Aggregation of parameters
 CLI supports aggregation of parameters i.e. `-b1`, `-e18`, and `-i1` can be joined into `-b1e18i1`.


+#### Symlink shortcuts
+It's possible to invoke `zstd` through a symlink.
+When the name of the symlink has a specific value, it triggers an associated behavior.
+- `zstdmt` : compress using all cores available on local system.
+- `zcat` : will decompress and output target file using any of the supported formats. `gzcat` and `zstdcat` are also equivalent.
+- `gzip` : if zlib support is enabled, will mimic `gzip` by compressing file using `.gz` format, removing source file by default (use `--keep` to preserve). If zlib is not supported, triggers an error.
+- `xz` : if lzma support is enabled, will mimic `xz` by compressing file using `.xz` format, removing source file by default (use `--keep` to preserve). If xz is not supported, triggers an error.
+- `lzma` : if lzma support is enabled, will mimic `lzma` by compressing file using `.lzma` format, removing source file by default (use `--keep` to preserve). If lzma is not supported, triggers an error.
+- `lz4` : if lz4 support is enabled, will mimic `lz4` by compressing file using `.lz4` format. If lz4 is not supported, triggers an error.
+- `unzstd` and `unlz4` will decompress any of the supported format.
+- `ungz`, `unxz` and `unlzma` will do the same, and will also remove source file by default (use `--keep` to preserve).
+
+
 #### Dictionary builder in Command Line Interface
 Zstd offers a training mode, which can be used to tune the algorithm for a selected
 type of data, by providing it with a few samples. The result of the training is stored
@ -107,7 +120,7 @@ Usage :
 FILE    : a filename
          with no FILE, or when FILE is - , read standard input
 Arguments :
- -#     : # compression level (1-19, default:3)
+ -#     : # compression level (1-19, default: 3)
 -d     : decompression
 -D file: use `file` as Dictionary
 -o file: result stored into `file` (only if 1 input file)
@ -125,13 +138,13 @@ Advanced arguments :
 --ultra : enable levels beyond 19, up to 22 (requires more memory)
 --long  : enable long distance matching (requires more memory)
 --no-dictID : don't write dictID into header (dictionary compression)
--[no-]check : integrity check (default:enabled)
+--[no-]check : integrity check (default: enabled)
 -r     : operate recursively on directories
 --format=gzip : compress files to the .gz format
 --format=xz : compress files to the .xz format
 --format=lzma : compress files to the .lzma format
 --test  : test compressed file integrity
--[no-]sparse : sparse mode (default:disabled)
+--[no-]sparse : sparse mode (default: disabled)
 -M#    : Set a memory usage limit for decompression
 --      : All arguments after "--" are treated as files

@ -140,13 +153,13 @@ Dictionary builder :
 --train-cover[=k=#,d=#,steps=#] : use the cover algorithm with optional args
 --train-legacy[=s=#] : use the legacy algorithm with selectivity (default: 9)
 -o file : `file` is dictionary name (default: dictionary)
--maxdict=# : limit dictionary to specified size (default : 112640)
+--maxdict=# : limit dictionary to specified size (default: 112640)
 --dictID=# : force dictionary ID to specified value (default: random)

 Benchmark arguments :
- -b#    : benchmark file(s), using # compression level (default : 1)
+ -b#    : benchmark file(s), using # compression level (default: 3)
 -e#    : test all compression levels from -bX to # (default: 1)
- -i#    : minimum evaluation time in seconds (default : 3s)
+ -i#    : minimum evaluation time in seconds (default: 3s)
 -B#    : cut file into independent blocks of size # (default: no block)
 --priority=rt : set process priority to real-time
 ```
@ -155,7 +168,7 @@ Benchmark arguments :
 #### Long distance matching mode
 The long distance matching mode, enabled with `--long`, is designed to improve
 the compression ratio for files with long matches at a large distance (up to the
-maximum window size, `128 MiB`) while still maintaining compression speed. 
+maximum window size, `128 MiB`) while still maintaining compression speed.

 Enabling this mode sets the window size to `128 MiB` and thus increases the memory
 usage for both the compressor and decompressor. Performance in terms of speed is
@ -168,7 +181,7 @@ decompression speed with and without long distance matching on an ideal use
 case: a tar of four versions of clang (versions `3.4.1`, `3.4.2`, `3.5.0`,
 `3.5.1`) with a total size of `244889600 B`. This is an ideal use case as there
 are many long distance matches within the maximum window size of `128 MiB` (each
-version is less than `128 MiB`). 
+version is less than `128 MiB`).

 Compression Speed vs Ratio | Decompression Speed
 ---------------------------|---------------------
@ -202,8 +215,3 @@ The below table illustrates this on the [Silesia compression corpus].
 | `zstd -5 --long` | `3.319` | `51.7 MB/s` | `371.9 MB/s` |
 | `zstd -10` | `3.523`    | `16.4 MB/s`   | `489.2 MB/s`  |
 | `zstd -10 --long`| `3.566` | `16.2 MB/s` | `415.7 MB/s`  |
-
-
-
-
-
--- a/sys/contrib/zstd/programs/bench.c
+++ b/sys/contrib/zstd/programs/bench.c
@ -22,7 +22,7 @@
 *  Compiler Warnings
 ****************************************/
 #ifdef _MSC_VER
-#  pragma warning(disable : 4127)                /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4127)   /* disable: C4127: conditional expression is constant */
 #endif


@ -34,6 +34,7 @@
 #include <stdlib.h>      /* malloc, free */
 #include <string.h>      /* memset */
 #include <stdio.h>       /* fprintf, fopen */
+#include <assert.h>      /* assert */

 #include "mem.h"
 #define ZSTD_STATIC_LINKING_ONLY
@ -51,8 +52,9 @@
 #  define ZSTD_GIT_COMMIT_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_GIT_COMMIT)
 #endif

-#define TIMELOOP_MICROSEC     1*1000000ULL /* 1 second */
-#define ACTIVEPERIOD_MICROSEC 70*1000000ULL /* 70 seconds */
+#define TIMELOOP_MICROSEC     (1*1000000ULL) /* 1 second */
+#define TIMELOOP_NANOSEC      (1*1000000000ULL) /* 1 second */
+#define ACTIVEPERIOD_MICROSEC (70*TIMELOOP_MICROSEC) /* 70 seconds */
 #define COOLPERIOD_SEC        10

 #define KB *(1 <<10)
@ -122,12 +124,12 @@ void BMK_setBlockSize(size_t blockSize)

 void BMK_setDecodeOnlyMode(unsigned decodeFlag) { g_decodeOnly = (decodeFlag>0); }

-static U32 g_nbThreads = 1;
-void BMK_setNbThreads(unsigned nbThreads) {
+static U32 g_nbWorkers = 0;
+void BMK_setNbWorkers(unsigned nbWorkers) {
 #ifndef ZSTD_MULTITHREAD
-    if (nbThreads > 1) DISPLAYLEVEL(2, "Note : multi-threading is disabled \n");
+    if (nbWorkers > 0) DISPLAYLEVEL(2, "Note : multi-threading is disabled \n");
 #endif
-    g_nbThreads = nbThreads;
+    g_nbWorkers = nbWorkers;
 }

 static U32 g_realTime = 0;
@ -212,6 +214,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,

    /* init */
    if (strlen(displayName)>17) displayName += strlen(displayName)-17;   /* display last 17 characters */
+    if (g_nbWorkers==1) g_nbWorkers=0;   /* prefer synchronous mode */

    if (g_decodeOnly) {  /* benchmark only decompression : source must be already compressed */
        const char* srcPtr = (const char*)srcBuffer;
@ -258,13 +261,19 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
    }   }   }

    /* warmimg up memory */
-    RDG_genBuffer(compressedBuffer, maxCompressedSize, 0.10, 0.50, 1);
+    if (g_decodeOnly) {
+        memcpy(compressedBuffer, srcBuffer, loadedCompressedSize);
+    } else {
+        RDG_genBuffer(compressedBuffer, maxCompressedSize, 0.10, 0.50, 1);
+    }

    /* Bench */
    {   U64 fastestC = (U64)(-1LL), fastestD = (U64)(-1LL);
        U64 const crcOrig = g_decodeOnly ? 0 : XXH64(srcBuffer, srcSize, 0);
        UTIL_time_t coolTime;
-        U64 const maxTime = (g_nbSeconds * TIMELOOP_MICROSEC) + 1;
+        U64 const maxTime = (g_nbSeconds * TIMELOOP_NANOSEC) + 1;
+        U32 nbDecodeLoops = (U32)((100 MB) / (srcSize+1)) + 1;  /* initial conservative speed estimate */
+        U32 nbCompressionLoops = (U32)((2 MB) / (srcSize+1)) + 1;  /* initial conservative speed estimate */
        U64 totalCTime=0, totalDTime=0;
        U32 cCompleted=g_decodeOnly, dCompleted=0;
 #       define NB_MARKS 4
@ -283,19 +292,17 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
            }

            if (!g_decodeOnly) {
-                UTIL_time_t clockStart;
                /* Compression */
                DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->\r", marks[markNb], displayName, (U32)srcSize);
                if (!cCompleted) memset(compressedBuffer, 0xE5, maxCompressedSize);  /* warm up and erase result buffer */

-                UTIL_sleepMilli(1);  /* give processor time to other processes */
+                UTIL_sleepMilli(5);  /* give processor time to other processes */
                UTIL_waitForNextTick();
-                clockStart = UTIL_getTime();

                if (!cCompleted) {   /* still some time to do compression tests */
-                    U64 const clockLoop = g_nbSeconds ? TIMELOOP_MICROSEC : 1;
                    U32 nbLoops = 0;
-                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_nbThreads, g_nbThreads);
+                    UTIL_time_t const clockStart = UTIL_getTime();
+                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_nbWorkers, g_nbWorkers);
                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_compressionLevel, cLevel);
                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_enableLongDistanceMatching, g_ldmFlag);
                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_ldmMinMatch, g_ldmMinMatch);
@ -307,13 +314,16 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
                      ZSTD_CCtx_setParameter(ctx, ZSTD_p_ldmHashEveryLog, g_ldmHashEveryLog);
                    }
                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_windowLog, comprParams->windowLog);
+                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_hashLog, comprParams->hashLog);
                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_chainLog, comprParams->chainLog);
                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_searchLog, comprParams->searchLog);
                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_minMatch, comprParams->searchLength);
                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_targetLength, comprParams->targetLength);
                    ZSTD_CCtx_setParameter(ctx, ZSTD_p_compressionStrategy, comprParams->strategy);
                    ZSTD_CCtx_loadDictionary(ctx, dictBuffer, dictBufferSize);
-                    do {
+
+                    if (!g_nbSeconds) nbCompressionLoops=1;
+                    for (nbLoops=0; nbLoops<nbCompressionLoops; nbLoops++) {
                        U32 blockNb;
                        for (blockNb=0; blockNb<nbBlocks; blockNb++) {
 #if 0   /* direct compression function, for occasional comparison */
@ -342,12 +352,16 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
                            }
                            blockTable[blockNb].cSize = out.pos;
 #endif
+                    }   }
+                    {   U64 const loopDuration = UTIL_clockSpanNano(clockStart);
+                        if (loopDuration > 0) {
+                            if (loopDuration < fastestC * nbCompressionLoops)
+                                fastestC = loopDuration / nbCompressionLoops;
+                            nbCompressionLoops = (U32)(TIMELOOP_NANOSEC / fastestC) + 1;
+                        } else {
+                            assert(nbCompressionLoops < 40000000);  /* avoid overflow */
+                            nbCompressionLoops *= 100;
                        }
-                        nbLoops++;
-                    } while (UTIL_clockSpanMicro(clockStart) < clockLoop);
-                    {   U64 const loopDuration = UTIL_clockSpanMicro(clockStart);
-                        if (loopDuration < fastestC*nbLoops)
-                            fastestC = loopDuration / nbLoops;
                        totalCTime += loopDuration;
                        cCompleted = (totalCTime >= maxTime);  /* end compression tests */
                }   }
@ -357,16 +371,14 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
                ratio = (double)srcSize / (double)cSize;
                markNb = (markNb+1) % NB_MARKS;
                {   int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
-                    double const compressionSpeed = (double)srcSize / fastestC;
+                    double const compressionSpeed = ((double)srcSize / fastestC) * 1000;
                    int const cSpeedAccuracy = (compressionSpeed < 10.) ? 2 : 1;
                    DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s\r",
                            marks[markNb], displayName, (U32)srcSize, (U32)cSize,
                            ratioAccuracy, ratio,
                            cSpeedAccuracy, compressionSpeed );
                }
-            } else {   /* g_decodeOnly */
-                memcpy(compressedBuffer, srcBuffer, loadedCompressedSize);
-            }
+            }  /* if (!g_decodeOnly) */

 #if 0       /* disable decompression test */
            dCompleted=1;
@ -375,16 +387,16 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
            /* Decompression */
            if (!dCompleted) memset(resultBuffer, 0xD6, srcSize);  /* warm result buffer */

-            UTIL_sleepMilli(1); /* give processor time to other processes */
+            UTIL_sleepMilli(5); /* give processor time to other processes */
            UTIL_waitForNextTick();

            if (!dCompleted) {
-                U64 clockLoop = g_nbSeconds ? TIMELOOP_MICROSEC : 1;
                U32 nbLoops = 0;
                ZSTD_DDict* const ddict = ZSTD_createDDict(dictBuffer, dictBufferSize);
                UTIL_time_t const clockStart = UTIL_getTime();
                if (!ddict) EXM_THROW(2, "ZSTD_createDDict() allocation failure");
-                do {
+                if (!g_nbSeconds) nbDecodeLoops = 1;
+                for (nbLoops=0; nbLoops < nbDecodeLoops; nbLoops++) {
                    U32 blockNb;
                    for (blockNb=0; blockNb<nbBlocks; blockNb++) {
                        size_t const regenSize = ZSTD_decompress_usingDDict(dctx,
@ -396,22 +408,26 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
                                      blockNb, (U32)blockTable[blockNb].cSize, ZSTD_getErrorName(regenSize));
                        }
                        blockTable[blockNb].resSize = regenSize;
-                    }
-                    nbLoops++;
-                } while (UTIL_clockSpanMicro(clockStart) < clockLoop);
+                }   }
                ZSTD_freeDDict(ddict);
-                {   U64 const loopDuration = UTIL_clockSpanMicro(clockStart);
-                    if (loopDuration < fastestD*nbLoops)
-                        fastestD = loopDuration / nbLoops;
+                {   U64 const loopDuration = UTIL_clockSpanNano(clockStart);
+                    if (loopDuration > 0) {
+                        if (loopDuration < fastestD * nbDecodeLoops)
+                            fastestD = loopDuration / nbDecodeLoops;
+                        nbDecodeLoops = (U32)(TIMELOOP_NANOSEC / fastestD) + 1;
+                    } else {
+                        assert(nbDecodeLoops < 40000000);  /* avoid overflow */
+                        nbDecodeLoops *= 100;
+                    }
                    totalDTime += loopDuration;
                    dCompleted = (totalDTime >= maxTime);
            }   }

            markNb = (markNb+1) % NB_MARKS;
            {   int const ratioAccuracy = (ratio < 10.) ? 3 : 2;
-                double const compressionSpeed = (double)srcSize / fastestC;
+                double const compressionSpeed = ((double)srcSize / fastestC) * 1000;
                int const cSpeedAccuracy = (compressionSpeed < 10.) ? 2 : 1;
-                double const decompressionSpeed = (double)srcSize / fastestD;
+                double const decompressionSpeed = ((double)srcSize / fastestD) * 1000;
                DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s ,%6.1f MB/s \r",
                        marks[markNb], displayName, (U32)srcSize, (U32)cSize,
                        ratioAccuracy, ratio,
@ -460,8 +476,8 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
        }   /* for (testNb = 1; testNb <= (g_nbSeconds + !g_nbSeconds); testNb++) */

        if (g_displayLevel == 1) {   /* hidden display mode -q, used by python speed benchmark */
-            double cSpeed = (double)srcSize / fastestC;
-            double dSpeed = (double)srcSize / fastestD;
+            double const cSpeed = ((double)srcSize / fastestC) * 1000;
+            double const dSpeed = ((double)srcSize / fastestD) * 1000;
            if (g_additionalParam)
                DISPLAY("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s  %s (param=%d)\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName, g_additionalParam);
            else
@ -518,9 +534,8 @@ static void BMK_benchCLevel(const void* srcBuffer, size_t benchedSize,
    if (g_displayLevel == 1 && !g_additionalParam)
        DISPLAY("bench %s %s: input %u bytes, %u seconds, %u KB blocks\n", ZSTD_VERSION_STRING, ZSTD_GIT_COMMIT_STRING, (U32)benchedSize, g_nbSeconds, (U32)(g_blockSize>>10));

-    if (cLevelLast < cLevel) cLevelLast = cLevel;
-
    for (l=cLevel; l <= cLevelLast; l++) {
+        if (l==0) continue;  /* skip level 0 */
        BMK_benchMem(srcBuffer, benchedSize,
                     displayName, l,
                     fileSizes, nbFiles,
@ -530,8 +545,8 @@ static void BMK_benchCLevel(const void* srcBuffer, size_t benchedSize,


 /*! BMK_loadFiles() :
-    Loads `buffer` with content of files listed within `fileNamesTable`.
-    At most, fills `buffer` entirely */
+ *  Loads `buffer` with content of files listed within `fileNamesTable`.
+ *  At most, fills `buffer` entirely. */
 static void BMK_loadFiles(void* buffer, size_t bufferSize,
                          size_t* fileSizes,
                          const char* const * const fileNamesTable, unsigned nbFiles)
@ -633,7 +648,8 @@ static void BMK_benchFileTable(const char* const * const fileNamesTable, unsigne
 }


-static void BMK_syntheticTest(int cLevel, int cLevelLast, double compressibility, const ZSTD_compressionParameters* compressionParams)
+static void BMK_syntheticTest(int cLevel, int cLevelLast, double compressibility,
+                              const ZSTD_compressionParameters* compressionParams)
 {
    char name[20] = {0};
    size_t benchedSize = 10000000;
@ -661,7 +677,6 @@ int BMK_benchFiles(const char** fileNamesTable, unsigned nbFiles,
 {
    double const compressibility = (double)g_compressibilityDefault / 100;

-    if (cLevel < 1) cLevel = 1;   /* minimum compression level */
    if (cLevel > ZSTD_maxCLevel()) cLevel = ZSTD_maxCLevel();
    if (cLevelLast > ZSTD_maxCLevel()) cLevelLast = ZSTD_maxCLevel();
    if (cLevelLast < cLevel) cLevelLast = cLevel;
--- a/sys/contrib/zstd/programs/bench.h
+++ b/sys/contrib/zstd/programs/bench.h
@ -22,7 +22,7 @@ int BMK_benchFiles(const char** fileNamesTable, unsigned nbFiles, const char* di
 /* Set Parameters */
 void BMK_setNbSeconds(unsigned nbLoops);
 void BMK_setBlockSize(size_t blockSize);
-void BMK_setNbThreads(unsigned nbThreads);
+void BMK_setNbWorkers(unsigned nbWorkers);
 void BMK_setRealTime(unsigned priority);
 void BMK_setNotificationLevel(unsigned level);
 void BMK_setSeparateFiles(unsigned separate);
--- a/sys/contrib/zstd/programs/fileio.c
+++ b/sys/contrib/zstd/programs/fileio.c
@ -36,18 +36,21 @@
 #  include <io.h>
 #endif

-#include "bitstream.h"
 #include "mem.h"
 #include "fileio.h"
 #include "util.h"
+
 #define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_magicNumber, ZSTD_frameHeaderSize_max */
 #include "zstd.h"
+#include "zstd_errors.h"           /* ZSTD_error_frameParameter_windowTooLarge */
+
 #if defined(ZSTD_GZCOMPRESS) || defined(ZSTD_GZDECOMPRESS)
 #  include <zlib.h>
 #  if !defined(z_const)
 #    define z_const
 #  endif
 #endif
+
 #if defined(ZSTD_LZMACOMPRESS) || defined(ZSTD_LZMADECOMPRESS)
 #  include <lzma.h>
 #endif
@ -84,10 +87,15 @@ void FIO_setNotificationLevel(unsigned level) { g_displayLevel=level; }
 static const U64 g_refreshRate = SEC_TO_MICRO / 6;
 static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;

-#define DISPLAYUPDATE(l, ...) { if (g_displayLevel>=l) { \
-            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (g_displayLevel>=4)) \
-            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
-            if (g_displayLevel>=4) fflush(stderr); } } }
+#define READY_FOR_UPDATE() (UTIL_clockSpanMicro(g_displayClock) > g_refreshRate)
+#define DELAY_NEXT_UPDATE() { g_displayClock = UTIL_getTime(); }
+#define DISPLAYUPDATE(l, ...) {                              \
+        if (g_displayLevel>=l) {                             \
+            if (READY_FOR_UPDATE() || (g_displayLevel>=4)) { \
+                DELAY_NEXT_UPDATE();                         \
+                DISPLAY(__VA_ARGS__);                        \
+                if (g_displayLevel>=4) fflush(stderr);       \
+    }   }   }

 #undef MIN  /* in case it would be already defined */
 #define MIN(a,b)    ((a) < (b) ? (a) : (b))
@ -139,7 +147,10 @@ static void INThandler(int sig)
 #if !defined(_MSC_VER)
    signal(sig, SIG_IGN);  /* this invocation generates a buggy warning in Visual Studio */
 #endif
-    if (g_artefact) remove(g_artefact);
+    if (g_artefact) {
+        assert(UTIL_isRegularFile(g_artefact));
+        remove(g_artefact);
+    }
    DISPLAY("\n");
    exit(2);
 }
@ -209,23 +220,23 @@ static U32 g_removeSrcFile = 0;
 void FIO_setRemoveSrcFile(unsigned flag) { g_removeSrcFile = (flag>0); }
 static U32 g_memLimit = 0;
 void FIO_setMemLimit(unsigned memLimit) { g_memLimit = memLimit; }
-static U32 g_nbThreads = 1;
-void FIO_setNbThreads(unsigned nbThreads) {
+static U32 g_nbWorkers = 1;
+void FIO_setNbWorkers(unsigned nbWorkers) {
 #ifndef ZSTD_MULTITHREAD
-    if (nbThreads > 1) DISPLAYLEVEL(2, "Note : multi-threading is disabled \n");
+    if (nbWorkers > 0) DISPLAYLEVEL(2, "Note : multi-threading is disabled \n");
 #endif
-    g_nbThreads = nbThreads;
+    g_nbWorkers = nbWorkers;
 }
 static U32 g_blockSize = 0;
 void FIO_setBlockSize(unsigned blockSize) {
-    if (blockSize && g_nbThreads==1)
+    if (blockSize && g_nbWorkers==0)
        DISPLAYLEVEL(2, "Setting block size is useless in single-thread mode \n");
    g_blockSize = blockSize;
 }
 #define FIO_OVERLAP_LOG_NOTSET 9999
 static U32 g_overlapLog = FIO_OVERLAP_LOG_NOTSET;
 void FIO_setOverlapLog(unsigned overlapLog){
-    if (overlapLog && g_nbThreads==1)
+    if (overlapLog && g_nbWorkers==0)
        DISPLAYLEVEL(2, "Setting overlapLog is useless in single-thread mode \n");
    g_overlapLog = overlapLog;
 }
@ -392,9 +403,9 @@ static size_t FIO_createDictBuffer(void** bufferPtr, const char* fileName)

 #ifndef ZSTD_NOCOMPRESS

-/*-**********************************************************************
-*  Compression
-************************************************************************/
+/* **********************************************************************
+ *  Compression
+ ************************************************************************/
 typedef struct {
    FILE* srcFile;
    FILE* dstFile;
@ -411,6 +422,7 @@ static cRess_t FIO_createCResources(const char* dictFileName, int cLevel,
    cRess_t ress;
    memset(&ress, 0, sizeof(ress));

+    DISPLAYLEVEL(6, "FIO_createCResources \n");
    ress.cctx = ZSTD_createCCtx();
    if (ress.cctx == NULL)
        EXM_THROW(30, "allocation error : can't create ZSTD_CCtx");
@ -421,7 +433,7 @@ static cRess_t FIO_createCResources(const char* dictFileName, int cLevel,
    if (!ress.srcBuffer || !ress.dstBuffer)
        EXM_THROW(31, "allocation error : not enough memory");

-    /* Advances parameters, including dictionary */
+    /* Advanced parameters, including dictionary */
    {   void* dictBuffer;
        size_t const dictBuffSize = FIO_createDictBuffer(&dictBuffer, dictFileName);   /* works with dictFileName==NULL */
        if (dictFileName && (dictBuffer==NULL))
@ -431,10 +443,9 @@ static cRess_t FIO_createCResources(const char* dictFileName, int cLevel,
        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_dictIDFlag, g_dictIDFlag) );
        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_checksumFlag, g_checksumFlag) );
        /* compression level */
-        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_compressionLevel, cLevel) );
+        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_compressionLevel, (unsigned)cLevel) );
        /* long distance matching */
-        CHECK( ZSTD_CCtx_setParameter(
-                      ress.cctx, ZSTD_p_enableLongDistanceMatching, g_ldmFlag) );
+        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_enableLongDistanceMatching, g_ldmFlag) );
        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_ldmHashLog, g_ldmHashLog) );
        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_ldmMinMatch, g_ldmMinMatch) );
        if (g_ldmBucketSizeLog != FIO_LDM_PARAM_NOTSET) {
@ -452,10 +463,12 @@ static cRess_t FIO_createCResources(const char* dictFileName, int cLevel,
        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_targetLength, comprParams->targetLength) );
        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_compressionStrategy, (U32)comprParams->strategy) );
        /* multi-threading */
-        DISPLAYLEVEL(5,"set nb threads = %u \n", g_nbThreads);
-        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_nbThreads, g_nbThreads) );
+#ifdef ZSTD_MULTITHREAD
+        DISPLAYLEVEL(5,"set nb workers = %u \n", g_nbWorkers);
+        CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_p_nbWorkers, g_nbWorkers) );
+#endif
        /* dictionary */
-        CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, srcSize) );  /* just for dictionary loading, for compression parameters adaptation */
+        CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, srcSize) );  /* set the value temporarily for dictionary loading, to adapt compression parameters */
        CHECK( ZSTD_CCtx_loadDictionary(ress.cctx, dictBuffer, dictBuffSize) );
        CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, ZSTD_CONTENTSIZE_UNKNOWN) );  /* reset */

@ -726,19 +739,93 @@ static unsigned long long FIO_compressLz4Frame(cRess_t* ress,
 *  @return : 0 : compression completed correctly,
 *            1 : missing or pb opening srcFileName
 */
-static int FIO_compressFilename_internal(cRess_t ress,
-                                         const char* dstFileName, const char* srcFileName, int compressionLevel)
+static unsigned long long
+FIO_compressZstdFrame(const cRess_t* ressPtr,
+                      const char* srcFileName, U64 fileSize,
+                      int compressionLevel, U64* readsize)
 {
+    cRess_t const ress = *ressPtr;
    FILE* const srcFile = ress.srcFile;
    FILE* const dstFile = ress.dstFile;
+    U64 compressedfilesize = 0;
+    ZSTD_EndDirective directive = ZSTD_e_continue;
+    DISPLAYLEVEL(6, "compression using zstd format \n");
+
+    /* init */
+    if (fileSize != UTIL_FILESIZE_UNKNOWN)
+        ZSTD_CCtx_setPledgedSrcSize(ress.cctx, fileSize);
+    (void)compressionLevel; (void)srcFileName;
+
+    /* Main compression loop */
+    do {
+        size_t result;
+        /* Fill input Buffer */
+        size_t const inSize = fread(ress.srcBuffer, (size_t)1, ress.srcBufferSize, srcFile);
+        ZSTD_inBuffer inBuff = { ress.srcBuffer, inSize, 0 };
+        DISPLAYLEVEL(6, "fread %u bytes from source \n", (U32)inSize);
+        *readsize += inSize;
+
+        if ((inSize == 0) || (*readsize == fileSize))
+            directive = ZSTD_e_end;
+
+        result = 1;
+        while (inBuff.pos != inBuff.size || (directive == ZSTD_e_end && result != 0)) {
+            ZSTD_outBuffer outBuff = { ress.dstBuffer, ress.dstBufferSize, 0 };
+            CHECK_V(result, ZSTD_compress_generic(ress.cctx, &outBuff, &inBuff, directive));
+
+            /* Write compressed stream */
+            DISPLAYLEVEL(6, "ZSTD_compress_generic(end:%u) => intput pos(%u)<=(%u)size ; output generated %u bytes \n",
+                            (U32)directive, (U32)inBuff.pos, (U32)inBuff.size, (U32)outBuff.pos);
+            if (outBuff.pos) {
+                size_t const sizeCheck = fwrite(ress.dstBuffer, 1, outBuff.pos, dstFile);
+                if (sizeCheck!=outBuff.pos)
+                    EXM_THROW(25, "Write error : cannot write compressed block");
+                compressedfilesize += outBuff.pos;
+            }
+            if (READY_FOR_UPDATE()) {
+                ZSTD_frameProgression const zfp = ZSTD_getFrameProgression(ress.cctx);
+                double const cShare = (double)zfp.produced / (zfp.consumed + !zfp.consumed/*avoid div0*/) * 100;
+                if (g_displayLevel >= 3) {
+                    DISPLAYUPDATE(3, "\r(L%i) Buffered :%4u MB - Consumed :%4u MB - Compressed :%4u MB => %.2f%%",
+                                compressionLevel,
+                                (U32)((zfp.ingested - zfp.consumed) >> 20),
+                                (U32)(zfp.consumed >> 20),
+                                (U32)(zfp.produced >> 20),
+                                cShare );
+                } else {   /* g_displayLevel == 2 */
+                    DISPLAYLEVEL(2, "\rRead : %u ", (U32)(zfp.consumed >> 20));
+                    if (fileSize != UTIL_FILESIZE_UNKNOWN)
+                        DISPLAYLEVEL(2, "/ %u ", (U32)(fileSize >> 20));
+                    DISPLAYLEVEL(2, "MB ==> %2.f%% ", cShare);
+                    DELAY_NEXT_UPDATE();
+                }
+            }
+        }
+    } while (directive != ZSTD_e_end);
+
+    return compressedfilesize;
+}
+
+/*! FIO_compressFilename_internal() :
+ *  same as FIO_compressFilename_extRess(), with `ress.desFile` already opened.
+ *  @return : 0 : compression completed correctly,
+ *            1 : missing or pb opening srcFileName
+ */
+static int
+FIO_compressFilename_internal(cRess_t ress,
+                              const char* dstFileName, const char* srcFileName,
+                              int compressionLevel)
+{
    U64 readsize = 0;
    U64 compressedfilesize = 0;
    U64 const fileSize = UTIL_getFileSize(srcFileName);
-    ZSTD_EndDirective directive = ZSTD_e_continue;
    DISPLAYLEVEL(5, "%s: %u bytes \n", srcFileName, (U32)fileSize);

+    /* compression format selection */
    switch (g_compressionType) {
+        default:
        case FIO_zstdCompression:
+            compressedfilesize = FIO_compressZstdFrame(&ress, srcFileName, fileSize, compressionLevel, &readsize);
            break;

        case FIO_gzipCompression:
@ -749,7 +836,7 @@ static int FIO_compressFilename_internal(cRess_t ress,
            EXM_THROW(20, "zstd: %s: file cannot be compressed as gzip (zstd compiled without ZSTD_GZCOMPRESS) -- ignored \n",
                            srcFileName);
 #endif
-            goto finish;
+            break;

        case FIO_xzCompression:
        case FIO_lzmaCompression:
@ -760,7 +847,7 @@ static int FIO_compressFilename_internal(cRess_t ress,
            EXM_THROW(20, "zstd: %s: file cannot be compressed as xz/lzma (zstd compiled without ZSTD_LZMACOMPRESS) -- ignored \n",
                            srcFileName);
 #endif
-            goto finish;
+            break;

        case FIO_lz4Compression:
 #ifdef ZSTD_LZ4COMPRESS
@ -770,62 +857,14 @@ static int FIO_compressFilename_internal(cRess_t ress,
            EXM_THROW(20, "zstd: %s: file cannot be compressed as lz4 (zstd compiled without ZSTD_LZ4COMPRESS) -- ignored \n",
                            srcFileName);
 #endif
-            goto finish;
+            break;
    }

-    /* init */
-    if (fileSize != UTIL_FILESIZE_UNKNOWN)
-        ZSTD_CCtx_setPledgedSrcSize(ress.cctx, fileSize);
-
-    /* Main compression loop */
-    do {
-        size_t result;
-        /* Fill input Buffer */
-        size_t const inSize = fread(ress.srcBuffer, (size_t)1, ress.srcBufferSize, srcFile);
-        ZSTD_inBuffer inBuff = { ress.srcBuffer, inSize, 0 };
-        readsize += inSize;
-
-        if (inSize == 0 || (fileSize != UTIL_FILESIZE_UNKNOWN && readsize == fileSize))
-            directive = ZSTD_e_end;
-
-        result = 1;
-        while (inBuff.pos != inBuff.size || (directive == ZSTD_e_end && result != 0)) {
-            ZSTD_outBuffer outBuff = { ress.dstBuffer, ress.dstBufferSize, 0 };
-            CHECK_V(result, ZSTD_compress_generic(ress.cctx, &outBuff, &inBuff, directive));
-
-            /* Write compressed stream */
-            DISPLAYLEVEL(6, "ZSTD_compress_generic,ZSTD_e_continue: generated %u bytes \n",
-                            (U32)outBuff.pos);
-            if (outBuff.pos) {
-                size_t const sizeCheck = fwrite(ress.dstBuffer, 1, outBuff.pos, dstFile);
-                if (sizeCheck!=outBuff.pos)
-                    EXM_THROW(25, "Write error : cannot write compressed block into %s", dstFileName);
-                compressedfilesize += outBuff.pos;
-            }
-        }
-        if (g_nbThreads > 1) {
-            if (fileSize == UTIL_FILESIZE_UNKNOWN)
-                DISPLAYUPDATE(2, "\rRead : %u MB", (U32)(readsize>>20))
-            else
-                DISPLAYUPDATE(2, "\rRead : %u / %u MB",
-                                    (U32)(readsize>>20), (U32)(fileSize>>20));
-        } else {
-            if (fileSize == UTIL_FILESIZE_UNKNOWN)
-                DISPLAYUPDATE(2, "\rRead : %u MB ==> %.2f%%",
-                                (U32)(readsize>>20),
-                                (double)compressedfilesize/readsize*100)
-            else
-                DISPLAYUPDATE(2, "\rRead : %u / %u MB ==> %.2f%%",
-                                (U32)(readsize>>20), (U32)(fileSize>>20),
-                                (double)compressedfilesize/readsize*100);
-        }
-    } while (directive != ZSTD_e_end);
-
-finish:
    /* Status */
    DISPLAYLEVEL(2, "\r%79s\r", "");
-    DISPLAYLEVEL(2,"%-20s :%6.2f%%   (%6llu => %6llu bytes, %s) \n", srcFileName,
-        (double)compressedfilesize/(readsize+(!readsize) /* avoid div by zero */ )*100,
+    DISPLAYLEVEL(2,"%-20s :%6.2f%%   (%6llu => %6llu bytes, %s) \n",
+        srcFileName,
+        (double)compressedfilesize / (readsize+(!readsize)/*avoid div by zero*/) * 100,
        (unsigned long long)readsize, (unsigned long long) compressedfilesize,
         dstFileName);

@ -861,7 +900,7 @@ static int FIO_compressFilename_srcFile(cRess_t ress,
         * delete both the source and destination files.
         */
        clearHandler();
-        if (remove(srcFileName))
+        if (FIO_remove(srcFileName))
            EXM_THROW(1, "zstd: %s: %s", srcFileName, strerror(errno));
    }
    return result;
@ -881,6 +920,7 @@ static int FIO_compressFilename_dstFile(cRess_t ress,
    stat_t statbuf;
    int stat_result = 0;

+    DISPLAYLEVEL(6, "FIO_compressFilename_dstFile: opening dst: %s", dstFileName);
    ress.dstFile = FIO_openDstFile(dstFileName);
    if (ress.dstFile==NULL) return 1;  /* could not open dstFileName */
    /* Must ony be added after FIO_openDstFile() succeeds.
@ -898,11 +938,13 @@ static int FIO_compressFilename_dstFile(cRess_t ress,
        DISPLAYLEVEL(1, "zstd: %s: %s \n", dstFileName, strerror(errno));
        result=1;
    }
-    if (result!=0) {  /* remove operation artefact */
-        if (remove(dstFileName))
-            EXM_THROW(1, "zstd: %s: %s", dstFileName, strerror(errno));
-    }
-    else if (strcmp (dstFileName, stdoutmark) && stat_result)
+    if ( (result != 0)  /* operation failure */
+      && strcmp(dstFileName, nulmark)      /* special case : don't remove() /dev/null */
+      && strcmp(dstFileName, stdoutmark) ) /* special case : don't remove() stdout */
+        FIO_remove(dstFileName); /* remove compression artefact; note don't do anything special if remove() fails */
+    else if ( strcmp(dstFileName, stdoutmark)
+           && strcmp(dstFileName, nulmark)
+           && stat_result)
        UTIL_setFileStat(dstFileName, &statbuf);

    return result;
@ -951,10 +993,14 @@ int FIO_compressMultipleFilenames(const char** inFileNamesTable, unsigned nbFile
    if (outFileName != NULL) {
        unsigned u;
        ress.dstFile = FIO_openDstFile(outFileName);
-        for (u=0; u<nbFiles; u++)
-            missed_files += FIO_compressFilename_srcFile(ress, outFileName, inFileNamesTable[u], compressionLevel);
-        if (fclose(ress.dstFile))
-            EXM_THROW(29, "Write error : cannot properly close stdout");
+        if (ress.dstFile==NULL) {  /* could not open outFileName */
+            missed_files = nbFiles;
+        } else {
+            for (u=0; u<nbFiles; u++)
+                missed_files += FIO_compressFilename_srcFile(ress, outFileName, inFileNamesTable[u], compressionLevel);
+            if (fclose(ress.dstFile))
+                EXM_THROW(29, "Write error : cannot properly close stdout");
+        }
    } else {
        unsigned u;
        for (u=0; u<nbFiles; u++) {
@ -1134,33 +1180,46 @@ static unsigned FIO_passThrough(FILE* foutput, FILE* finput, void* buffer, size_
    return 0;
 }

-static void FIO_zstdErrorHelp(dRess_t* ress, size_t ret, char const* srcFileName)
+/* FIO_highbit64() :
+ * gives position of highest bit.
+ * note : only works for v > 0 !
+ */
+static unsigned FIO_highbit64(unsigned long long v)
+{
+    unsigned count = 0;
+    assert(v != 0);
+    v >>= 1;
+    while (v) { v >>= 1; count++; }
+    return count;
+}
+
+/* FIO_zstdErrorHelp() :
+ * detailed error message when requested window size is too large */
+static void FIO_zstdErrorHelp(dRess_t* ress, size_t err, char const* srcFileName)
 {
    ZSTD_frameHeader header;
-    /* No special help for these errors */
-    if (ZSTD_getErrorCode(ret) != ZSTD_error_frameParameter_windowTooLarge)
+
+    /* Help message only for one specific error */
+    if (ZSTD_getErrorCode(err) != ZSTD_error_frameParameter_windowTooLarge)
        return;
+
    /* Try to decode the frame header */
-    ret = ZSTD_getFrameHeader(&header, ress->srcBuffer, ress->srcBufferLoaded);
-    if (ret == 0) {
-        U32 const windowSize = (U32)header.windowSize;
-        U32 const windowLog = BIT_highbit32(windowSize) + ((windowSize & (windowSize - 1)) != 0);
-        U32 const windowMB = (windowSize >> 20) + ((windowSize & ((1 MB) - 1)) != 0);
-        assert(header.windowSize <= (U64)((U32)-1));
+    err = ZSTD_getFrameHeader(&header, ress->srcBuffer, ress->srcBufferLoaded);
+    if (err == 0) {
+        unsigned long long const windowSize = header.windowSize;
+        U32 const windowLog = FIO_highbit64(windowSize) + ((windowSize & (windowSize - 1)) != 0);
+        U32 const windowMB = (U32)((windowSize >> 20) + ((windowSize & ((1 MB) - 1)) != 0));
+        assert(windowSize < (U64)(1ULL << 52));
        assert(g_memLimit > 0);
        DISPLAYLEVEL(1, "%s : Window size larger than maximum : %llu > %u\n",
-                        srcFileName, header.windowSize, g_memLimit);
+                        srcFileName, windowSize, g_memLimit);
        if (windowLog <= ZSTD_WINDOWLOG_MAX) {
            DISPLAYLEVEL(1, "%s : Use --long=%u or --memory=%uMB\n",
                            srcFileName, windowLog, windowMB);
            return;
        }
-    } else if (ZSTD_getErrorCode(ret) != ZSTD_error_frameParameter_windowTooLarge) {
-        DISPLAYLEVEL(1, "%s : Error decoding frame header to read window size : %s\n",
-                        srcFileName, ZSTD_getErrorName(ret));
-        return;
    }
-    DISPLAYLEVEL(1, "%s : Window log larger than ZSTD_WINDOWLOG_MAX=%u not supported\n",
+    DISPLAYLEVEL(1, "%s : Window log larger than ZSTD_WINDOWLOG_MAX=%u; not supported\n",
                    srcFileName, ZSTD_WINDOWLOG_MAX);
 }

@ -1571,7 +1630,7 @@ static int FIO_decompressSrcFile(dRess_t ress, const char* dstFileName, const ch
         * delete both the source and destination files.
         */
        clearHandler();
-        if (remove(srcFileName)) {
+        if (FIO_remove(srcFileName)) {
            /* failed to remove src file */
            DISPLAYLEVEL(1, "zstd: %s: %s \n", srcFileName, strerror(errno));
            return 1;
@ -1614,7 +1673,7 @@ static int FIO_decompressDstFile(dRess_t ress,
    if ( (result != 0)  /* operation failure */
      && strcmp(dstFileName, nulmark)      /* special case : don't remove() /dev/null (#316) */
      && strcmp(dstFileName, stdoutmark) ) /* special case : don't remove() stdout */
-        remove(dstFileName);  /* remove decompression artefact; note don't do anything special if remove() fails */
+        FIO_remove(dstFileName);  /* remove decompression artefact; note don't do anything special if remove() fails */
    else {  /* operation success */
        if ( strcmp(dstFileName, stdoutmark) /* special case : don't chmod stdout */
          && strcmp(dstFileName, nulmark)    /* special case : don't chmod /dev/null */
@ -1917,6 +1976,7 @@ static void displayInfo(const char* inFileName, const fileInfo_t* info, int disp
 static fileInfo_t FIO_addFInfo(fileInfo_t fi1, fileInfo_t fi2)
 {
    fileInfo_t total;
+    memset(&total, 0, sizeof(total));
    total.numActualFrames = fi1.numActualFrames + fi2.numActualFrames;
    total.numSkippableFrames = fi1.numSkippableFrames + fi2.numSkippableFrames;
    total.compressedSize = fi1.compressedSize + fi2.compressedSize;
--- a/sys/contrib/zstd/programs/fileio.h
+++ b/sys/contrib/zstd/programs/fileio.h
@ -54,7 +54,7 @@ void FIO_setDictIDFlag(unsigned dictIDFlag);
 void FIO_setChecksumFlag(unsigned checksumFlag);
 void FIO_setRemoveSrcFile(unsigned flag);
 void FIO_setMemLimit(unsigned memLimit);
-void FIO_setNbThreads(unsigned nbThreads);
+void FIO_setNbWorkers(unsigned nbWorkers);
 void FIO_setBlockSize(unsigned blockSize);
 void FIO_setOverlapLog(unsigned overlapLog);
 void FIO_setLdmFlag(unsigned ldmFlag);
--- a/sys/contrib/zstd/programs/platform.h
+++ b/sys/contrib/zstd/programs/platform.h
@ -71,7 +71,7 @@ extern "C" {
 ***************************************************************/
 #if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || (defined(__APPLE__) && defined(__MACH__)) /* UNIX-like OS */ \
   || defined(__midipix__) || defined(__VMS))
-#  if (defined(__APPLE__) && defined(__MACH__)) || defined(__SVR4) || defined(_AIX) || defined(__hpux) /* POSIX.1–2001 (SUSv3) conformant */ \
+#  if (defined(__APPLE__) && defined(__MACH__)) || defined(__SVR4) || defined(_AIX) || defined(__hpux) /* POSIX.1-2001 (SUSv3) conformant */ \
     || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)  /* BSD distros */
 #    define PLATFORM_POSIX_VERSION 200112L
 #  else
@ -106,8 +106,7 @@ extern "C" {
 #  include <io.h>      /* _isatty */
 #  include <windows.h> /* DeviceIoControl, HANDLE, FSCTL_SET_SPARSE */
 #  include <stdio.h>   /* FILE */
-static __inline int IS_CONSOLE(FILE* stdStream)
-{
+static __inline int IS_CONSOLE(FILE* stdStream) {
    DWORD dummy;
    return _isatty(_fileno(stdStream)) && GetConsoleMode((HANDLE)_get_osfhandle(_fileno(stdStream)), &dummy);
 }
--- a/sys/contrib/zstd/programs/util.h
+++ b/sys/contrib/zstd/programs/util.h
@ -142,7 +142,9 @@ static int g_utilDisplayLevel;
        }
        return 1000000000ULL*(clockEnd.QuadPart - clockStart.QuadPart)/ticksPerSecond.QuadPart;
    }
+
 #elif defined(__APPLE__) && defined(__MACH__)
+
    #include <mach/mach_time.h>
    #define UTIL_TIME_INITIALIZER 0
    typedef U64 UTIL_time_t;
@ -167,7 +169,9 @@ static int g_utilDisplayLevel;
        }
        return ((clockEnd - clockStart) * (U64)rate.numer) / ((U64)rate.denom);
    }
+
 #elif (PLATFORM_POSIX_VERSION >= 200112L) && (defined __UCLIBC__ || ((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17) || __GLIBC__ > 2))
+
    #define UTIL_TIME_INITIALIZER { 0, 0 }
    typedef struct timespec UTIL_freq_t;
    typedef struct timespec UTIL_time_t;
@ -217,12 +221,18 @@ static int g_utilDisplayLevel;
 #define SEC_TO_MICRO 1000000

 /* returns time span in microseconds */
-UTIL_STATIC U64 UTIL_clockSpanMicro( UTIL_time_t clockStart )
+UTIL_STATIC U64 UTIL_clockSpanMicro(UTIL_time_t clockStart )
 {
    UTIL_time_t const clockEnd = UTIL_getTime();
    return UTIL_getSpanTimeMicro(clockStart, clockEnd);
 }

+/* returns time span in microseconds */
+UTIL_STATIC U64 UTIL_clockSpanNano(UTIL_time_t clockStart )
+{
+    UTIL_time_t const clockEnd = UTIL_getTime();
+    return UTIL_getSpanTimeNano(clockStart, clockEnd);
+}

 UTIL_STATIC void UTIL_waitForNextTick(void)
 {
@ -246,11 +256,17 @@ UTIL_STATIC void UTIL_waitForNextTick(void)
 #endif


+UTIL_STATIC int UTIL_isRegularFile(const char* infilename);
+
+
 UTIL_STATIC int UTIL_setFileStat(const char *filename, stat_t *statbuf)
 {
    int res = 0;
    struct utimbuf timebuf;

+    if (!UTIL_isRegularFile(filename))
+        return -1;
+
    timebuf.actime = time(NULL);
    timebuf.modtime = statbuf->st_mtime;
    res += utime(filename, &timebuf);  /* set access and modification times */
--- a/sys/contrib/zstd/programs/zstd.1
+++ b/sys/contrib/zstd/programs/zstd.1
@ -1,5 +1,5 @@
 .
-.TH "ZSTD" "1" "December 2017" "zstd 1.3.3" "User Commands"
+.TH "ZSTD" "1" "2018-01-27" "zstd 1.3.4" "User Commands"
 .
 .SH "NAME"
 \fBzstd\fR \- zstd, zstdmt, unzstd, zstdcat \- Compress or decompress \.zst files
@ -136,7 +136,7 @@ force write to standard output, even if it is the console
 .
 .TP
 \fB\-\-[no\-]sparse\fR
-enable / disable sparse FS support, to make files with many zeroes smaller on disk\. Creating sparse files may save disk space and speed up decompression by reducing the amount of disk I/O\. default : enabled when output is into a file, and disabled when output is stdout\. This setting overrides default and can force sparse mode over stdout\.
+enable / disable sparse FS support, to make files with many zeroes smaller on disk\. Creating sparse files may save disk space and speed up decompression by reducing the amount of disk I/O\. default: enabled when output is into a file, and disabled when output is stdout\. This setting overrides default and can force sparse mode over stdout\.
 .
 .TP
 \fB\-\-rm\fR
@ -172,14 +172,14 @@ suppress warnings, interactivity, and notifications\. specify twice to suppress
 .
 .TP
 \fB\-C\fR, \fB\-\-[no\-]check\fR
-add integrity check computed from uncompressed data (default : enabled)
+add integrity check computed from uncompressed data (default: enabled)
 .
 .TP
 \fB\-\-\fR
 All arguments after \fB\-\-\fR are treated as files
 .
 .SH "DICTIONARY BUILDER"
-\fBzstd\fR offers \fIdictionary\fR compression, useful for very small files and messages\. It\'s possible to train \fBzstd\fR with some samples, the result of which is saved into a file called a \fBdictionary\fR\. Then during compression and decompression, reference the same dictionary\. It will improve compression ratio of small files\. Typical gains range from 10% (at 64KB) to x5 better (at <1KB)\.
+\fBzstd\fR offers \fIdictionary\fR compression, which greatly improves efficiency on small files and messages\. It\'s possible to train \fBzstd\fR with a set of samples, the result of which is saved into a file called a \fBdictionary\fR\. Then during compression and decompression, reference the same dictionary, using command \fB\-D dictionaryFileName\fR\. Compression of small files similar to the sample set will be greatly improved\.
 .
 .TP
 \fB\-\-train FILEs\fR
@ -197,6 +197,10 @@ Dictionary saved into \fBfile\fR (default name: dictionary)\.
 Limit dictionary to specified size (default: 112640)\.
 .
 .TP
+\fB\-#\fR
+Use \fB#\fR compression level during training (optional)\. Will generate statistics more tuned for selected compression level, resulting in a \fIsmall\fR compression ratio improvement for this level\.
+.
+.TP
 \fB\-B#\fR
 Split input files in blocks of size # (default: no split)
 .
@ -251,7 +255,7 @@ benchmark file(s) using multiple compression levels, from \fB\-b#\fR to \fB\-e#\
 .
 .TP
 \fB\-i#\fR
-minimum evaluation time, in seconds (default : 3s), benchmark mode only
+minimum evaluation time, in seconds (default: 3s), benchmark mode only
 .
 .TP
 \fB\-B#\fR, \fB\-\-block\-size=#\fR
--- a/sys/contrib/zstd/programs/zstd.1.md
+++ b/sys/contrib/zstd/programs/zstd.1.md
@ -115,11 +115,25 @@ the last one takes effect.

    Note: If `windowLog` is set to larger than 27, `--long=windowLog` or
    `--memory=windowSize` needs to be passed to the decompressor.
+* `--fast[=#]`:
+    switch to ultra-fast compression levels.
+    If `=#` is not present, it defaults to `1`.
+    The higher the value, the faster the compression speed,
+    at the cost of some compression ratio.
+    This setting overwrites compression level if one was set previously.
+    Similarly, if a compression level is set after `--fast`, it overrides it.
+
 * `-T#`, `--threads=#`:
-    Compress using `#` threads (default: 1).
+    Compress using `#` working threads (default: 1).
    If `#` is 0, attempt to detect and use the number of physical CPU cores.
-    In all cases, the nb of threads is capped to ZSTDMT_NBTHREADS_MAX==256.
+    In all cases, the nb of threads is capped to ZSTDMT_NBTHREADS_MAX==200.
    This modifier does nothing if `zstd` is compiled without multithread support.
+* `--single-thread`:
+    Does not spawn a thread for compression, use caller thread instead.
+    This is the only available mode when multithread support is disabled.
+    In this mode, compression is serialized with I/O.
+    (This is different from `-T1`, which spawns 1 compression thread in parallel of I/O).
+    Single-thread mode also features lower memory usage.
 * `-D file`:
    use `file` as Dictionary to compress or decompress FILE(s)
 * `--nodictID`:
@ -137,7 +151,7 @@ the last one takes effect.
    to make files with many zeroes smaller on disk.
    Creating sparse files may save disk space and speed up decompression by
    reducing the amount of disk I/O.
-    default : enabled when output is into a file,
+    default: enabled when output is into a file,
    and disabled when output is stdout.
    This setting overrides default and can force sparse mode over stdout.
 * `--rm`:
@ -163,7 +177,7 @@ the last one takes effect.
    suppress warnings, interactivity, and notifications.
    specify twice to suppress errors too.
 * `-C`, `--[no-]check`:
-    add integrity check computed from uncompressed data (default : enabled)
+    add integrity check computed from uncompressed data (default: enabled)
 * `--`:
    All arguments after `--` are treated as files

@ -171,12 +185,12 @@ the last one takes effect.
 DICTIONARY BUILDER
 ------------------
 `zstd` offers _dictionary_ compression,
-useful for very small files and messages.
-It's possible to train `zstd` with some samples,
+which greatly improves efficiency on small files and messages.
+It's possible to train `zstd` with a set of samples,
 the result of which is saved into a file called a `dictionary`.
-Then during compression and decompression, reference the same dictionary.
-It will improve compression ratio of small files.
-Typical gains range from 10% (at 64KB) to x5 better (at <1KB).
+Then during compression and decompression, reference the same dictionary,
+using command `-D dictionaryFileName`.
+Compression of small files similar to the sample set will be greatly improved.

 * `--train FILEs`:
    Use FILEs as training set to create a dictionary.
@ -192,6 +206,10 @@ Typical gains range from 10% (at 64KB) to x5 better (at <1KB).
    Dictionary saved into `file` (default name: dictionary).
 * `--maxdict=#`:
    Limit dictionary to specified size (default: 112640).
+* `-#`:
+    Use `#` compression level during training (optional).
+    Will generate statistics more tuned for selected compression level,
+    resulting in a _small_ compression ratio improvement for this level.
 * `-B#`:
    Split input files in blocks of size # (default: no split)
 * `--dictID=#`:
@ -252,7 +270,7 @@ BENCHMARK
 * `-e#`:
    benchmark file(s) using multiple compression levels, from `-b#` to `-e#` (inclusive)
 * `-i#`:
-    minimum evaluation time, in seconds (default : 3s), benchmark mode only
+    minimum evaluation time, in seconds (default: 3s), benchmark mode only
 * `-B#`, `--block-size=#`:
    cut file(s) into independent blocks of size # (default: no block)
 * `--priority=rt`:
@ -329,14 +347,21 @@ The list of available _options_:
    The minimum _slen_ is 3 and the maximum is 7.

 - `targetLen`=_tlen_, `tlen`=_tlen_:
-    Specify the minimum match length that causes a match finder to stop
-    searching for better matches.
+    The impact of this field vary depending on selected strategy.

-    A larger minimum match length usually improves compression ratio but
-    decreases compression speed.
-    This option is only used with strategies ZSTD_btopt and ZSTD_btultra.
+    For ZSTD\_btopt and ZSTD\_btultra, it specifies the minimum match length
+    that causes match finder to stop searching for better matches.
+    A larger `targetLen` usually improves compression ratio
+    but decreases compression speed.

-    The minimum _tlen_ is 4 and the maximum is 999.
+    For ZSTD\_fast, it specifies
+    the amount of data skipped between match sampling.
+    Impact is reversed : a larger `targetLen` increases compression speed
+    but decreases compression ratio.
+
+    For all other strategies, this field has no impact.
+
+    The minimum _tlen_ is 1 and the maximum is 999.

 - `overlapLog`=_ovlog_,  `ovlog`=_ovlog_:
    Determine `overlapSize`, amount of data reloaded from previous job.
--- a/sys/contrib/zstd/programs/zstdcli.c
+++ b/sys/contrib/zstd/programs/zstdcli.c
@ -54,6 +54,7 @@
 #define ZSTD_ZSTDMT "zstdmt"
 #define ZSTD_UNZSTD "unzstd"
 #define ZSTD_CAT "zstdcat"
+#define ZSTD_ZCAT "zcat"
 #define ZSTD_GZ "gzip"
 #define ZSTD_GUNZIP "gunzip"
 #define ZSTD_GZCAT "gzcat"
@ -105,7 +106,7 @@ static int usage(const char* programName)
    DISPLAY( "          with no FILE, or when FILE is - , read standard input\n");
    DISPLAY( "Arguments : \n");
 #ifndef ZSTD_NOCOMPRESS
-    DISPLAY( " -#     : # compression level (1-%d, default:%d) \n", ZSTDCLI_CLEVEL_MAX, ZSTDCLI_CLEVEL_DEFAULT);
+    DISPLAY( " -#     : # compression level (1-%d, default: %d) \n", ZSTDCLI_CLEVEL_MAX, ZSTDCLI_CLEVEL_DEFAULT);
 #endif
 #ifndef ZSTD_NODECOMPRESS
    DISPLAY( " -d     : decompression \n");
@ -132,13 +133,14 @@ static int usage_advanced(const char* programName)
    DISPLAY( " -l     : print information about zstd compressed files \n");
 #ifndef ZSTD_NOCOMPRESS
    DISPLAY( "--ultra : enable levels beyond %i, up to %i (requires more memory)\n", ZSTDCLI_CLEVEL_MAX, ZSTD_maxCLevel());
-    DISPLAY( "--long[=#]  : enable long distance matching with given window log (default : %u)\n", g_defaultMaxWindowLog);
+    DISPLAY( "--long[=#]: enable long distance matching with given window log (default: %u)\n", g_defaultMaxWindowLog);
+    DISPLAY( "--fast[=#]: switch to ultra fast compression level (default: %u)\n", 1);
 #ifdef ZSTD_MULTITHREAD
-    DISPLAY( " -T#    : use # threads for compression (default:1) \n");
-    DISPLAY( " -B#    : select size of each job (default:0==automatic) \n");
+    DISPLAY( " -T#    : spawns # compression threads (default: 1, 0==# cores) \n");
+    DISPLAY( " -B#    : select size of each job (default: 0==automatic) \n");
 #endif
    DISPLAY( "--no-dictID : don't write dictID into header (dictionary compression)\n");
-    DISPLAY( "--[no-]check : integrity check (default:enabled) \n");
+    DISPLAY( "--[no-]check : integrity check (default: enabled) \n");
 #endif
 #ifdef UTIL_HAS_CREATEFILELIST
    DISPLAY( " -r     : operate recursively on directories \n");
@ -156,9 +158,9 @@ static int usage_advanced(const char* programName)
 #ifndef ZSTD_NODECOMPRESS
    DISPLAY( "--test  : test compressed file integrity \n");
 #if ZSTD_SPARSE_DEFAULT
-    DISPLAY( "--[no-]sparse : sparse mode (default:enabled on file, disabled on stdout)\n");
+    DISPLAY( "--[no-]sparse : sparse mode (default: enabled on file, disabled on stdout)\n");
 #else
-    DISPLAY( "--[no-]sparse : sparse mode (default:disabled)\n");
+    DISPLAY( "--[no-]sparse : sparse mode (default: disabled)\n");
 #endif
 #endif
    DISPLAY( " -M#    : Set a memory usage limit for decompression \n");
@ -170,15 +172,15 @@ static int usage_advanced(const char* programName)
    DISPLAY( "--train-cover[=k=#,d=#,steps=#] : use the cover algorithm with optional args\n");
    DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel);
    DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
-    DISPLAY( "--maxdict=# : limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
+    DISPLAY( "--maxdict=# : limit dictionary to specified size (default: %u) \n", g_defaultMaxDictSize);
    DISPLAY( "--dictID=# : force dictionary ID to specified value (default: random)\n");
 #endif
 #ifndef ZSTD_NOBENCH
    DISPLAY( "\n");
    DISPLAY( "Benchmark arguments : \n");
-    DISPLAY( " -b#    : benchmark file(s), using # compression level (default : 1) \n");
+    DISPLAY( " -b#    : benchmark file(s), using # compression level (default: %d) \n", ZSTDCLI_CLEVEL_DEFAULT);
    DISPLAY( " -e#    : test all compression levels from -bX to # (default: 1)\n");
-    DISPLAY( " -i#    : minimum evaluation time in seconds (default : 3s) \n");
+    DISPLAY( " -i#    : minimum evaluation time in seconds (default: 3s) \n");
    DISPLAY( " -B#    : cut file into independent blocks of size # (default: no block)\n");
    DISPLAY( "--priority=rt : set process priority to real-time \n");
 #endif
@ -218,10 +220,10 @@ static int exeNameMatch(const char* exeName, const char* test)
 }

 /*! readU32FromChar() :
-    @return : unsigned integer value read from input in `char` format
-    allows and interprets K, KB, KiB, M, MB and MiB suffix.
-    Will also modify `*stringPtr`, advancing it to position where it stopped reading.
-    Note : function result can overflow if digit string > MAX_UINT */
+ * @return : unsigned integer value read from input in `char` format.
+ *  allows and interprets K, KB, KiB, M, MB and MiB suffix.
+ *  Will also modify `*stringPtr`, advancing it to position where it stopped reading.
+ *  Note : function result can overflow if digit string > MAX_UINT */
 static unsigned readU32FromChar(const char** stringPtr)
 {
    unsigned result = 0;
@ -240,7 +242,7 @@ static unsigned readU32FromChar(const char** stringPtr)
 /** longCommandWArg() :
 *  check if *stringPtr is the same as longCommand.
 *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
- *  @return 0 and doesn't modify *stringPtr otherwise.
+ * @return 0 and doesn't modify *stringPtr otherwise.
 */
 static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
 {
@ -318,12 +320,13 @@ static unsigned parseCompressionParameters(const char* stringPtr, ZSTD_compressi
        if (longCommandWArg(&stringPtr, "ldmSearchLength=") || longCommandWArg(&stringPtr, "ldmslen=")) { g_ldmMinMatch = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
        if (longCommandWArg(&stringPtr, "ldmBucketSizeLog=") || longCommandWArg(&stringPtr, "ldmblog=")) { g_ldmBucketSizeLog = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
        if (longCommandWArg(&stringPtr, "ldmHashEveryLog=") || longCommandWArg(&stringPtr, "ldmhevery=")) { g_ldmHashEveryLog = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        DISPLAYLEVEL(4, "invalid compression parameter \n");
        return 0;
    }

+    DISPLAYLEVEL(4, "windowLog=%d, chainLog=%d, hashLog=%d, searchLog=%d \n", params->windowLog, params->chainLog, params->hashLog, params->searchLog);
+    DISPLAYLEVEL(4, "searchLength=%d, targetLength=%d, strategy=%d \n", params->searchLength, params->targetLength, params->strategy);
    if (stringPtr[0] != 0) return 0; /* check the end of string */
-    DISPLAYLEVEL(4, "windowLog=%d\nchainLog=%d\nhashLog=%d\nsearchLog=%d\n", params->windowLog, params->chainLog, params->hashLog, params->searchLog);
-    DISPLAYLEVEL(4, "searchLength=%d\ntargetLength=%d\nstrategy=%d\n", params->searchLength, params->targetLength, params->strategy);
    return 1;
 }

@ -364,27 +367,28 @@ typedef enum { zom_compress, zom_decompress, zom_test, zom_bench, zom_train, zom
 int main(int argCount, const char* argv[])
 {
    int argNb,
-        forceStdout=0,
-        followLinks=0,
-        main_pause=0,
-        nextEntryIsDictionary=0,
-        operationResult=0,
-        nextArgumentIsOutFileName=0,
-        nextArgumentIsMaxDict=0,
-        nextArgumentIsDictID=0,
-        nextArgumentsAreFiles=0,
-        ultra=0,
+        followLinks = 0,
+        forceStdout = 0,
        lastCommand = 0,
-        nbThreads = 1,
-        setRealTimePrio = 0,
+        ldmFlag = 0,
+        main_pause = 0,
+        nbWorkers = 0,
+        nextArgumentIsOutFileName = 0,
+        nextArgumentIsMaxDict = 0,
+        nextArgumentIsDictID = 0,
+        nextArgumentsAreFiles = 0,
+        nextEntryIsDictionary = 0,
+        operationResult = 0,
        separateFiles = 0,
-        ldmFlag = 0;
+        setRealTimePrio = 0,
+        singleThread = 0,
+        ultra=0;
    unsigned bench_nbSeconds = 3;   /* would be better if this value was synchronized from bench */
    size_t blockSize = 0;
    zstd_operation_mode operation = zom_compress;
    ZSTD_compressionParameters compressionParams;
    int cLevel = ZSTDCLI_CLEVEL_DEFAULT;
-    int cLevelLast = 1;
+    int cLevelLast = -1000000000;
    unsigned recursive = 0;
    unsigned memLimit = 0;
    const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*));   /* argCount >= 1 */
@ -416,22 +420,25 @@ int main(int argCount, const char* argv[])
    if (filenameTable==NULL) { DISPLAY("zstd: %s \n", strerror(errno)); exit(1); }
    filenameTable[0] = stdinmark;
    g_displayOut = stderr;
-
    programName = lastNameFromPath(programName);
+#ifdef ZSTD_MULTITHREAD
+    nbWorkers = 1;
+#endif

    /* preset behaviors */
-    if (exeNameMatch(programName, ZSTD_ZSTDMT)) nbThreads=0;
+    if (exeNameMatch(programName, ZSTD_ZSTDMT)) nbWorkers=0;
    if (exeNameMatch(programName, ZSTD_UNZSTD)) operation=zom_decompress;
-    if (exeNameMatch(programName, ZSTD_CAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; g_displayLevel=1; }
-    if (exeNameMatch(programName, ZSTD_GZ)) { suffix = GZ_EXTENSION; FIO_setCompressionType(FIO_gzipCompression); FIO_setRemoveSrcFile(1); }    /* behave like gzip */
-    if (exeNameMatch(programName, ZSTD_GUNZIP)) { operation=zom_decompress; FIO_setRemoveSrcFile(1); }                                          /* behave like gunzip */
-    if (exeNameMatch(programName, ZSTD_GZCAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; g_displayLevel=1; }  /* behave like gzcat */
-    if (exeNameMatch(programName, ZSTD_LZMA)) { suffix = LZMA_EXTENSION; FIO_setCompressionType(FIO_lzmaCompression); FIO_setRemoveSrcFile(1); }    /* behave like lzma */
-    if (exeNameMatch(programName, ZSTD_UNLZMA)) { operation=zom_decompress; FIO_setCompressionType(FIO_lzmaCompression); FIO_setRemoveSrcFile(1); }    /* behave like unlzma */
-    if (exeNameMatch(programName, ZSTD_XZ)) { suffix = XZ_EXTENSION; FIO_setCompressionType(FIO_xzCompression); FIO_setRemoveSrcFile(1); }    /* behave like xz */
-    if (exeNameMatch(programName, ZSTD_UNXZ)) { operation=zom_decompress; FIO_setCompressionType(FIO_xzCompression); FIO_setRemoveSrcFile(1); }    /* behave like unxz */
-    if (exeNameMatch(programName, ZSTD_LZ4)) { suffix = LZ4_EXTENSION; FIO_setCompressionType(FIO_lz4Compression); FIO_setRemoveSrcFile(1); }    /* behave like xz */
-    if (exeNameMatch(programName, ZSTD_UNLZ4)) { operation=zom_decompress; FIO_setCompressionType(FIO_lz4Compression); FIO_setRemoveSrcFile(1); }    /* behave like unxz */
+    if (exeNameMatch(programName, ZSTD_CAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; g_displayLevel=1; }   /* supports multiple formats */
+    if (exeNameMatch(programName, ZSTD_ZCAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; g_displayLevel=1; }  /* behave like zcat, also supports multiple formats */
+    if (exeNameMatch(programName, ZSTD_GZ)) { suffix = GZ_EXTENSION; FIO_setCompressionType(FIO_gzipCompression); FIO_setRemoveSrcFile(1); }               /* behave like gzip */
+    if (exeNameMatch(programName, ZSTD_GUNZIP)) { operation=zom_decompress; FIO_setRemoveSrcFile(1); }                                                     /* behave like gunzip, also supports multiple formats */
+    if (exeNameMatch(programName, ZSTD_GZCAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; g_displayLevel=1; } /* behave like gzcat, also supports multiple formats */
+    if (exeNameMatch(programName, ZSTD_LZMA)) { suffix = LZMA_EXTENSION; FIO_setCompressionType(FIO_lzmaCompression); FIO_setRemoveSrcFile(1); }           /* behave like lzma */
+    if (exeNameMatch(programName, ZSTD_UNLZMA)) { operation=zom_decompress; FIO_setCompressionType(FIO_lzmaCompression); FIO_setRemoveSrcFile(1); }        /* behave like unlzma, also supports multiple formats */
+    if (exeNameMatch(programName, ZSTD_XZ)) { suffix = XZ_EXTENSION; FIO_setCompressionType(FIO_xzCompression); FIO_setRemoveSrcFile(1); }                 /* behave like xz */
+    if (exeNameMatch(programName, ZSTD_UNXZ)) { operation=zom_decompress; FIO_setCompressionType(FIO_xzCompression); FIO_setRemoveSrcFile(1); }            /* behave like unxz, also supports multiple formats */
+    if (exeNameMatch(programName, ZSTD_LZ4)) { suffix = LZ4_EXTENSION; FIO_setCompressionType(FIO_lz4Compression); }                                       /* behave like lz4 */
+    if (exeNameMatch(programName, ZSTD_UNLZ4)) { operation=zom_decompress; FIO_setCompressionType(FIO_lz4Compression); }                                   /* behave like unlz4, also supports multiple formats */
    memset(&compressionParams, 0, sizeof(compressionParams));

    /* command switches */
@ -478,6 +485,7 @@ int main(int argCount, const char* argv[])
                    if (!strcmp(argument, "--keep")) { FIO_setRemoveSrcFile(0); continue; }
                    if (!strcmp(argument, "--rm")) { FIO_setRemoveSrcFile(1); continue; }
                    if (!strcmp(argument, "--priority=rt")) { setRealTimePrio = 1; continue; }
+                    if (!strcmp(argument, "--single-thread")) { nbWorkers = 0; singleThread = 1; continue; }
 #ifdef ZSTD_GZCOMPRESS
                    if (!strcmp(argument, "--format=gzip")) { suffix = GZ_EXTENSION; FIO_setCompressionType(FIO_gzipCompression); continue; }
 #endif
@ -512,7 +520,7 @@ int main(int argCount, const char* argv[])
                      continue;
                    }
 #endif
-                    if (longCommandWArg(&argument, "--threads=")) { nbThreads = readU32FromChar(&argument); continue; }
+                    if (longCommandWArg(&argument, "--threads=")) { nbWorkers = readU32FromChar(&argument); continue; }
                    if (longCommandWArg(&argument, "--memlimit=")) { memLimit = readU32FromChar(&argument); continue; }
                    if (longCommandWArg(&argument, "--memory=")) { memLimit = readU32FromChar(&argument); continue; }
                    if (longCommandWArg(&argument, "--memlimit-decompress=")) { memLimit = readU32FromChar(&argument); continue; }
@ -536,6 +544,21 @@ int main(int argCount, const char* argv[])
                            compressionParams.windowLog = ldmWindowLog;
                        continue;
                    }
+                    if (longCommandWArg(&argument, "--fast")) {
+                        /* Parse optional window log */
+                        if (*argument == '=') {
+                            U32 fastLevel;
+                            ++argument;
+                            fastLevel = readU32FromChar(&argument);
+                            if (fastLevel) cLevel = - (int)fastLevel;
+                        } else if (*argument != 0) {
+                            /* Invalid character following --fast */
+                            CLEAN_RETURN(badusage(programName));
+                        } else {
+                            cLevel = -1;  /* default for --fast */
+                        }
+                        continue;
+                    }
                    /* fall-through, will trigger bad_usage() later on */
                }

@ -566,7 +589,8 @@ int main(int argCount, const char* argv[])
                         /* Decoding */
                    case 'd':
 #ifndef ZSTD_NOBENCH
-                            if (operation==zom_bench) { BMK_setDecodeOnlyMode(1); argument++; break; }  /* benchmark decode (hidden option) */
+                            BMK_setDecodeOnlyMode(1);
+                            if (operation==zom_bench) { argument++; break; }  /* benchmark decode (hidden option) */
 #endif
                            operation=zom_decompress; argument++; break;

@ -645,7 +669,7 @@ int main(int argCount, const char* argv[])
                        /* nb of threads (hidden option) */
                    case 'T':
                        argument++;
-                        nbThreads = readU32FromChar(&argument);
+                        nbWorkers = readU32FromChar(&argument);
                        break;

                        /* Dictionary Selection level */
@ -713,11 +737,13 @@ int main(int argCount, const char* argv[])
    /* Welcome message (if verbose) */
    DISPLAYLEVEL(3, WELCOME_MESSAGE);

-    if (nbThreads == 0) {
-        /* try to guess */
-        nbThreads = UTIL_countPhysicalCores();
-        DISPLAYLEVEL(3, "Note: %d physical core(s) detected \n", nbThreads);
+#ifdef ZSTD_MULTITHREAD
+    if ((nbWorkers==0) && (!singleThread)) {
+        /* automatically set # workers based on # of reported cpus */
+        nbWorkers = UTIL_countPhysicalCores();
+        DISPLAYLEVEL(3, "Note: %d physical core(s) detected \n", nbWorkers);
    }
+#endif

    g_utilDisplayLevel = g_displayLevel;
    if (!followLinks) {
@ -760,7 +786,7 @@ int main(int argCount, const char* argv[])
        BMK_setNotificationLevel(g_displayLevel);
        BMK_setSeparateFiles(separateFiles);
        BMK_setBlockSize(blockSize);
-        BMK_setNbThreads(nbThreads);
+        BMK_setNbWorkers(nbWorkers);
        BMK_setRealTime(setRealTimePrio);
        BMK_setNbSeconds(bench_nbSeconds);
        BMK_setLdmFlag(ldmFlag);
@ -788,7 +814,7 @@ int main(int argCount, const char* argv[])
        zParams.dictID = dictID;
        if (cover) {
            int const optimize = !coverParams.k || !coverParams.d;
-            coverParams.nbThreads = nbThreads;
+            coverParams.nbThreads = nbWorkers;
            coverParams.zParams = zParams;
            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, blockSize, NULL, &coverParams, optimize);
        } else {
@ -803,16 +829,22 @@ int main(int argCount, const char* argv[])
    }

 #ifndef ZSTD_NODECOMPRESS
-    if (operation==zom_test) { outFileName=nulmark; FIO_setRemoveSrcFile(0); } /* test mode */
+    if (operation==zom_test) { outFileName=nulmark; FIO_setRemoveSrcFile(0); }  /* test mode */
 #endif

    /* No input filename ==> use stdin and stdout */
    filenameIdx += !filenameIdx;   /* filenameTable[0] is stdin by default */
-    if (!strcmp(filenameTable[0], stdinmark) && !outFileName) outFileName = stdoutmark;   /* when input is stdin, default output is stdout */
+    if (!strcmp(filenameTable[0], stdinmark) && !outFileName)
+        outFileName = stdoutmark;  /* when input is stdin, default output is stdout */

    /* Check if input/output defined as console; trigger an error in this case */
-    if (!strcmp(filenameTable[0], stdinmark) && IS_CONSOLE(stdin) ) CLEAN_RETURN(badusage(programName));
-    if (outFileName && !strcmp(outFileName, stdoutmark) && IS_CONSOLE(stdout) && !strcmp(filenameTable[0], stdinmark) && !forceStdout && operation!=zom_decompress)
+    if (!strcmp(filenameTable[0], stdinmark) && IS_CONSOLE(stdin) )
+        CLEAN_RETURN(badusage(programName));
+    if ( outFileName && !strcmp(outFileName, stdoutmark)
+      && IS_CONSOLE(stdout)
+      && !strcmp(filenameTable[0], stdinmark)
+      && !forceStdout
+      && operation!=zom_decompress )
        CLEAN_RETURN(badusage(programName));

 #ifndef ZSTD_NOCOMPRESS
@ -832,7 +864,7 @@ int main(int argCount, const char* argv[])
    FIO_setNotificationLevel(g_displayLevel);
    if (operation==zom_compress) {
 #ifndef ZSTD_NOCOMPRESS
-        FIO_setNbThreads(nbThreads);
+        FIO_setNbWorkers(nbWorkers);
        FIO_setBlockSize((U32)blockSize);
        FIO_setLdmFlag(ldmFlag);
        FIO_setLdmHashLog(g_ldmHashLog);
--- a/sys/contrib/zstd/tests/.gitignore
+++ b/sys/contrib/zstd/tests/.gitignore
@ -23,6 +23,9 @@ decodecorpus
 pool
 poolTests
 invalidDictionaries
+checkTag
+zcat
+zstdcat

 # Tmp test directory
 zstdtest
@ -52,6 +55,7 @@ tmp*
 *.gz
 result
 out
+*.zstd

 # fuzzer
 afl
--- a/sys/contrib/zstd/tests/Makefile
+++ b/sys/contrib/zstd/tests/Makefile
@ -21,7 +21,7 @@
 ZSTDDIR = ../lib
 PRGDIR  = ../programs
 PYTHON ?= python3
-TESTARTEFACT := versionsTest namespaceTest
+TESTARTEFACT := versionsTest

 DEBUGLEVEL ?= 1
 DEBUGFLAGS  = -g -DZSTD_DEBUG=$(DEBUGLEVEL)
@ -44,6 +44,16 @@ ZSTD_FILES  := $(ZSTDDECOMP_FILES) $(ZSTDCOMMON_FILES) $(ZSTDCOMP_FILES)
 ZBUFF_FILES := $(ZSTDDIR)/deprecated/*.c
 ZDICT_FILES := $(ZSTDDIR)/dictBuilder/*.c

+ZSTD_F1 := $(wildcard $(ZSTD_FILES))
+ZSTD_OBJ1 := $(subst $(ZSTDDIR)/common/,zstdm_,$(ZSTD_F1))
+ZSTD_OBJ2 := $(subst $(ZSTDDIR)/compress/,zstdc_,$(ZSTD_OBJ1))
+ZSTD_OBJ3 := $(subst $(ZSTDDIR)/decompress/,zstdd_,$(ZSTD_OBJ2))
+ZSTD_OBJECTS := $(ZSTD_OBJ3:.c=.o)
+
+ZSTDMT_OBJ1 := $(subst $(ZSTDDIR)/common/,zstdmt_m_,$(ZSTD_F1))
+ZSTDMT_OBJ2 := $(subst $(ZSTDDIR)/compress/,zstdmt_c_,$(ZSTDMT_OBJ1))
+ZSTDMT_OBJ3 := $(subst $(ZSTDDIR)/decompress/,zstdmt_d_,$(ZSTDMT_OBJ2))
+ZSTDMT_OBJECTS := $(ZSTDMT_OBJ3:.c=.o)

 # Define *.exe as extension for Windows systems
 ifneq (,$(filter Windows%,$(OS)))
@ -63,14 +73,17 @@ FUZZERTEST ?= -T200s
 ZSTDRTTEST = --test-large-data
 DECODECORPUS_TESTTIME ?= -T30

-.PHONY: default all all32 allnothread dll clean test test32 test-all namespaceTest versionsTest
+.PHONY: default all all32 allnothread dll clean test test32 test-all versionsTest

 default: fullbench
+	@echo $(ZSTDMT_OBJECTS)

-all: fullbench fuzzer zstreamtest paramgrill datagen decodecorpus
+all: fullbench fuzzer zstreamtest paramgrill datagen decodecorpus roundTripCrash

 all32: fullbench32 fuzzer32 zstreamtest32

+allnothread: MULTITHREAD_CPP=
+allnothread: MULTITHREAD_LD=
 allnothread: fullbench fuzzer paramgrill datagen decodecorpus

 dll: fuzzer-dll zstreamtest-dll
@ -85,37 +98,71 @@ zstd-nolegacy:
 	$(MAKE) -C $(PRGDIR) $@ MOREFLAGS+="$(DEBUGFLAGS)"

 gzstd:
-	$(MAKE) -C $(PRGDIR) zstd HAVE_ZLIB=1 MOREFLAGS="$(DEBUGFLAGS)"
+	$(MAKE) -C $(PRGDIR) zstd HAVE_ZLIB=1 MOREFLAGS+="$(DEBUGFLAGS)"
+
+.PHONY: zstd-dll
+zstd-dll :
+	$(MAKE) -C $(ZSTDDIR) libzstd
+
+.PHONY: zstd-staticLib
+zstd-staticLib :
+	$(MAKE) -C $(ZSTDDIR) libzstd.a
+
+zstdm_%.o : $(ZSTDDIR)/common/%.c
+	$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
+
+zstdc_%.o : $(ZSTDDIR)/compress/%.c
+	$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
+
+zstdd_%.o : $(ZSTDDIR)/decompress/%.c
+	$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
+
+zstdmt%.o : CPPFLAGS += $(MULTITHREAD_CPP)
+
+zstdmt_m_%.o : $(ZSTDDIR)/common/%.c
+	$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
+
+zstdmt_c_%.o : $(ZSTDDIR)/compress/%.c
+	$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
+
+zstdmt_d_%.o : $(ZSTDDIR)/decompress/%.c
+	$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@

 fullbench32: CPPFLAGS += -m32
 fullbench fullbench32 : CPPFLAGS += $(MULTITHREAD_CPP)
 fullbench fullbench32 : LDFLAGS += $(MULTITHREAD_LD)
 fullbench fullbench32 : DEBUGFLAGS =   # turn off assert() for speed measurements
-fullbench fullbench32 : $(ZSTD_FILES) $(PRGDIR)/datagen.c fullbench.c
+fullbench fullbench32 : $(ZSTD_FILES)
+fullbench fullbench32 : $(PRGDIR)/datagen.c fullbench.c
 	$(CC) $(FLAGS) $^ -o $@$(EXT)

-fullbench-lib: $(PRGDIR)/datagen.c fullbench.c
-	$(MAKE) -C $(ZSTDDIR) libzstd.a
-	$(CC) $(FLAGS) $^ -o $@$(EXT) $(ZSTDDIR)/libzstd.a
+fullbench-lib : zstd-staticLib
+fullbench-lib : $(PRGDIR)/datagen.c fullbench.c
+	$(CC) $(FLAGS) $(filter %.c,$^) -o $@$(EXT) $(ZSTDDIR)/libzstd.a

+# note : broken : requires unavailable symbols
+fullbench-dll : zstd-dll
+fullbench-dll : LDFLAGS+= -L$(ZSTDDIR) -lzstd
 fullbench-dll: $(PRGDIR)/datagen.c fullbench.c
-	$(MAKE) -C $(ZSTDDIR) libzstd
-	$(CC) $(FLAGS) $^ -o $@$(EXT) -DZSTD_DLL_IMPORT=1 $(ZSTDDIR)/dll/libzstd.dll
+#	$(CC) $(FLAGS) $(filter %.c,$^) -o $@$(EXT) -DZSTD_DLL_IMPORT=1 $(ZSTDDIR)/dll/libzstd.dll
+	$(CC) $(FLAGS) $(filter %.c,$^) -o $@$(EXT)

-fuzzer : CPPFLAGS += $(MULTITHREAD_CPP)
-fuzzer : LDFLAGS += $(MULTITHREAD_LD)
+fuzzer  : CPPFLAGS += $(MULTITHREAD_CPP)
+fuzzer  : LDFLAGS += $(MULTITHREAD_LD)
 fuzzer32: CFLAGS += -m32
-fuzzer fuzzer32 : $(ZSTD_FILES) $(ZDICT_FILES) $(PRGDIR)/datagen.c fuzzer.c
+fuzzer  : $(ZSTDMT_OBJECTS)
+fuzzer32: $(ZSTD_FILES)
+fuzzer fuzzer32 : $(ZDICT_FILES) $(PRGDIR)/datagen.c fuzzer.c
 	$(CC) $(FLAGS) $^ -o $@$(EXT)

+fuzzer-dll : zstd-dll
 fuzzer-dll : LDFLAGS+= -L$(ZSTDDIR) -lzstd
 fuzzer-dll : $(ZSTDDIR)/common/xxhash.c $(PRGDIR)/datagen.c fuzzer.c
-	$(MAKE) -C $(ZSTDDIR) libzstd
-	$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@$(EXT)
+	$(CC) $(CPPFLAGS) $(CFLAGS) $(filter %.c,$^) $(LDFLAGS) -o $@$(EXT)

 zbufftest : CPPFLAGS += -I$(ZSTDDIR)/deprecated
 zbufftest : CFLAGS += -Wno-deprecated-declarations   # required to silence deprecation warnings
-zbufftest : $(ZSTD_FILES) $(ZBUFF_FILES) $(PRGDIR)/datagen.c zbufftest.c
+zbufftest : $(ZSTD_OBJECTS) $(ZBUFF_FILES) $(PRGDIR)/datagen.c zbufftest.c
 	$(CC) $(FLAGS) $^ -o $@$(EXT)

 zbufftest32 : CPPFLAGS += -I$(ZSTDDIR)/deprecated
@ -123,18 +170,22 @@ zbufftest32 : CFLAGS += -Wno-deprecated-declarations -m32
 zbufftest32 : $(ZSTD_FILES) $(ZBUFF_FILES) $(PRGDIR)/datagen.c zbufftest.c
 	$(CC) $(FLAGS) $^ -o $@$(EXT)

+zbufftest-dll : zstd-dll
 zbufftest-dll : CPPFLAGS += -I$(ZSTDDIR)/deprecated
 zbufftest-dll : CFLAGS += -Wno-deprecated-declarations   # required to silence deprecation warnings
 zbufftest-dll : LDFLAGS+= -L$(ZSTDDIR) -lzstd
 zbufftest-dll : $(ZSTDDIR)/common/xxhash.c $(PRGDIR)/datagen.c zbufftest.c
-	$(MAKE) -C $(ZSTDDIR) libzstd
-	$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@$(EXT)
+	$(CC) $(CPPFLAGS) $(CFLAGS) $(filter %.c,$^) $(LDFLAGS) -o $@$(EXT)

-ZSTREAMFILES := $(ZSTD_FILES) $(ZDICT_FILES) $(PRGDIR)/datagen.c seqgen.c zstreamtest.c
+ZSTREAM_LOCAL_FILES := $(PRGDIR)/datagen.c seqgen.c zstreamtest.c
+ZSTREAM_PROPER_FILES := $(ZDICT_FILES) $(ZSTREAM_LOCAL_FILES)
+ZSTREAMFILES := $(ZSTD_FILES) $(ZSTREAM_PROPER_FILES)
 zstreamtest32 : CFLAGS += -m32
 zstreamtest zstreamtest32 : CPPFLAGS += $(MULTITHREAD_CPP)
 zstreamtest zstreamtest32 : LDFLAGS += $(MULTITHREAD_LD)
-zstreamtest zstreamtest32 : $(ZSTREAMFILES)
+zstreamtest : $(ZSTDMT_OBJECTS) $(ZSTREAM_PROPER_FILES)
+zstreamtest32 : $(ZSTREAMFILES)
+zstreamtest zstreamtest32 :
 	$(CC) $(FLAGS) $^ -o $@$(EXT)

 zstreamtest_asan : CFLAGS += -fsanitize=address
@ -145,54 +196,53 @@ zstreamtest_tsan : CFLAGS += -fsanitize=thread
 zstreamtest_tsan : $(ZSTREAMFILES)
 	$(CC) $(FLAGS) $(MULTITHREAD) $^ -o $@$(EXT)

+zstreamtest-dll : zstd-dll
 zstreamtest-dll : LDFLAGS+= -L$(ZSTDDIR) -lzstd
-zstreamtest-dll : $(ZSTDDIR)/common/xxhash.c $(PRGDIR)/datagen.c zstreamtest.c
-	$(MAKE) -C $(ZSTDDIR) libzstd
-	$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@$(EXT)
+zstreamtest-dll : $(ZSTDDIR)/common/xxhash.c  # xxh symbols not exposed from dll
+zstreamtest-dll : $(ZSTREAM_LOCAL_FILES)
+	$(CC) $(CPPFLAGS) $(CFLAGS) $(filter %.c,$^) $(LDFLAGS) -o $@$(EXT)

-paramgrill : DEBUGFLAGS =
+paramgrill : DEBUGFLAGS =   # turn off assert() for speed measurements
 paramgrill : $(ZSTD_FILES) $(PRGDIR)/datagen.c paramgrill.c
-	$(CC)      $(FLAGS) $^ -lm -o $@$(EXT)
+	$(CC) $(FLAGS) $^ -lm -o $@$(EXT)

 datagen : $(PRGDIR)/datagen.c datagencli.c
-	$(CC)      $(FLAGS) $^ -o $@$(EXT)
+	$(CC) $(FLAGS) $^ -o $@$(EXT)

-roundTripCrash : $(ZSTD_FILES) roundTripCrash.c
-	$(CC)      $(FLAGS) $(MULTITHREAD) $^ -o $@$(EXT)
+roundTripCrash : $(ZSTD_OBJECTS) roundTripCrash.c
+	$(CC) $(FLAGS) $(MULTITHREAD) $^ -o $@$(EXT)

-longmatch  : $(ZSTD_FILES) longmatch.c
-	$(CC)      $(FLAGS) $^ -o $@$(EXT)
+longmatch  : $(ZSTD_OBJECTS) longmatch.c
+	$(CC) $(FLAGS) $^ -o $@$(EXT)

-invalidDictionaries  : $(ZSTD_FILES) invalidDictionaries.c
-	$(CC)      $(FLAGS) $^ -o $@$(EXT)
+invalidDictionaries : $(ZSTD_OBJECTS) invalidDictionaries.c
+	$(CC) $(FLAGS) $^ -o $@$(EXT)

-legacy : CFLAGS+= -DZSTD_LEGACY_SUPPORT=4
-legacy : CPPFLAGS+= -I$(ZSTDDIR)/legacy
+legacy : CPPFLAGS += -I$(ZSTDDIR)/legacy -DZSTD_LEGACY_SUPPORT=4
 legacy : $(ZSTD_FILES) $(wildcard $(ZSTDDIR)/legacy/*.c) legacy.c
-	$(CC)      $(FLAGS) $^ -o $@$(EXT)
+	$(CC) $(FLAGS) $^ -o $@$(EXT)

-decodecorpus	: $(filter-out $(ZSTDDIR)/compress/zstd_compress.c, $(wildcard $(ZSTD_FILES))) $(ZDICT_FILES) decodecorpus.c
-	$(CC)      $(FLAGS) $^ -o $@$(EXT) -lm
+decodecorpus : $(filter-out zstdc_zstd_compress.o, $(ZSTD_OBJECTS)) $(ZDICT_FILES) decodecorpus.c
+	$(CC) $(FLAGS) $^ -o $@$(EXT) -lm

-symbols  : symbols.c
-	$(MAKE) -C $(ZSTDDIR) libzstd
+symbols  : symbols.c zstd-dll
 ifneq (,$(filter Windows%,$(OS)))
 	cp $(ZSTDDIR)/dll/libzstd.dll .
-	$(CC) $(FLAGS) $^ -o $@$(EXT) -DZSTD_DLL_IMPORT=1 libzstd.dll
+	$(CC) $(FLAGS) $< -o $@$(EXT) -DZSTD_DLL_IMPORT=1 libzstd.dll
 else
-	$(CC) $(FLAGS) $^ -o $@$(EXT) -Wl,-rpath=$(ZSTDDIR) $(ZSTDDIR)/libzstd.so
+	$(CC) $(FLAGS) $< -o $@$(EXT) -Wl,-rpath=$(ZSTDDIR) $(ZSTDDIR)/libzstd.so   # broken on Mac
 endif

 poolTests : poolTests.c $(ZSTDDIR)/common/pool.c $(ZSTDDIR)/common/threading.c $(ZSTDDIR)/common/zstd_common.c $(ZSTDDIR)/common/error_private.c
 	$(CC) $(FLAGS) $(MULTITHREAD) $^ -o $@$(EXT)

-namespaceTest:
-	if $(CC) namespaceTest.c ../lib/common/xxhash.c -o $@ ; then echo compilation should fail; exit 1 ; fi
-	$(RM) $@
-
+.PHONY: versionsTest
 versionsTest: clean
 	$(PYTHON) test-zstd-versions.py

+checkTag: checkTag.c $(ZSTDDIR)/zstd.h
+	$(CC) $(FLAGS) $< -o $@$(EXT)
+
 clean:
 	$(MAKE) -C $(ZSTDDIR) clean
 	@$(RM) -fR $(TESTARTEFACT)
@ -205,7 +255,7 @@ clean:
        zstreamtest$(EXT) zstreamtest32$(EXT) \
        datagen$(EXT) paramgrill$(EXT) roundTripCrash$(EXT) longmatch$(EXT) \
        symbols$(EXT) invalidDictionaries$(EXT) legacy$(EXT) poolTests$(EXT) \
-        decodecorpus$(EXT)
+        decodecorpus$(EXT) checkTag$(EXT)
 	@echo Cleaning completed


@ -247,15 +297,23 @@ ifneq (,$(filter $(shell uname),SunOS))
 DIFF:=gdiff
 endif

+.PHONY: list
+list:
+	@$(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null | awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' | sort | egrep -v -e '^[^[:alnum:]]' -e '^$@$$' | xargs
+
+.PHONY: zstd-playTests
 zstd-playTests: datagen
 	file $(ZSTD)
 	ZSTD="$(QEMU_SYS) $(ZSTD)" ./playTests.sh $(ZSTDRTTEST)

+.PHONY: shortest
 shortest: ZSTDRTTEST=
 shortest: test-zstd

+.PHONY: fuzztest
 fuzztest: test-fuzzer test-zstream test-decodecorpus

+.PHONY: test
 test: test-zstd test-fullbench test-fuzzer test-zstream test-invalidDictionaries test-legacy test-decodecorpus
 ifeq ($(QEMU_SYS),)
 test: test-pool
--- a/sys/contrib/zstd/tests/checkTag.c
+++ b/sys/contrib/zstd/tests/checkTag.c
@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* checkTag : validation tool for libzstd
+ * command :
+ * $ ./checkTag tag
+ * checkTag validates tags of following format : v[0-9].[0-9].[0-9]{any}
+ * The tag is then compared to zstd version number.
+ * They are compatible if first 3 digits are identical.
+ * Anything beyond that is free, and doesn't impact validation.
+ * Example : tag v1.8.1.2 is compatible with version 1.8.1
+ * When tag and version are not compatible, program exits with error code 1.
+ * When they are compatible, it exists with a code 0.
+ * checkTag is intended to be used in automated testing environment.
+ */
+
+#include <stdio.h>   /* printf */
+#include <string.h>  /* strlen, strncmp */
+#include "zstd.h"    /* ZSTD_VERSION_STRING */
+
+
+/*  validate() :
+ * @return 1 if tag is compatible, 0 if not.
+ */
+static int validate(const char* const tag)
+{
+    size_t const tagLength = strlen(tag);
+    size_t const verLength = strlen(ZSTD_VERSION_STRING);
+
+    if (tagLength < 2) return 0;
+    if (tag[0] != 'v') return 0;
+    if (tagLength <= verLength) return 0;
+
+    if (strncmp(ZSTD_VERSION_STRING, tag+1, verLength)) return 0;
+
+    return 1;
+}
+
+int main(int argc, const char** argv)
+{
+    const char* const exeName = argv[0];
+    const char* const tag = argv[1];
+    if (argc!=2) {
+        printf("incorrect usage : %s tag \n", exeName);
+        return 2;
+    }
+
+    printf("Version : %s \n", ZSTD_VERSION_STRING);
+    printf("Tag     : %s \n", tag);
+
+    if (validate(tag)) {
+        printf("OK : tag is compatible with zstd version \n");
+        return 0;
+    }
+
+    printf("!! error : tag and versions are not compatible !! \n");
+    return 1;
+}
--- a/sys/contrib/zstd/tests/fullbench.c
+++ b/sys/contrib/zstd/tests/fullbench.c
@ -15,6 +15,7 @@
 #include "util.h"        /* Compiler options, UTIL_GetFileSize */
 #include <stdlib.h>      /* malloc */
 #include <stdio.h>       /* fprintf, fopen, ftello64 */
+#include <assert.h>      /* assert */

 #include "mem.h"         /* U32 */
 #ifndef ZSTD_DLL_IMPORT
@ -181,7 +182,7 @@ static size_t local_ZSTD_compress_generic_T2_end(void* dst, size_t dstCapacity,
    ZSTD_inBuffer buffIn;
    (void)buff2;
    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_compressionLevel, 1);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_nbThreads, 2);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_nbWorkers, 2);
    buffOut.dst = dst;
    buffOut.size = dstCapacity;
    buffOut.pos = 0;
@ -198,7 +199,7 @@ static size_t local_ZSTD_compress_generic_T2_continue(void* dst, size_t dstCapac
    ZSTD_inBuffer buffIn;
    (void)buff2;
    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_compressionLevel, 1);
-    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_nbThreads, 2);
+    ZSTD_CCtx_setParameter(g_cstream, ZSTD_p_nbWorkers, 2);
    buffOut.dst = dst;
    buffOut.size = dstCapacity;
    buffOut.pos = 0;
@ -413,33 +414,48 @@ static size_t benchMem(const void* src, size_t srcSize, U32 benchNb)
        break;

    /* test functions */
-    /* by convention, test functions can be added > 100 */
+    /* convention: test functions have ID > 100 */

    default : ;
    }

-    { size_t i; for (i=0; i<dstBuffSize; i++) dstBuff[i]=(BYTE)i; }     /* warming up memory */
+     /* warming up memory */
+    { size_t i; for (i=0; i<dstBuffSize; i++) dstBuff[i]=(BYTE)i; }

+    /* benchmark loop */
    {   U32 loopNb;
-#       define TIME_SEC_MICROSEC     (1*1000000ULL) /* 1 second */
-        U64 const clockLoop = TIMELOOP_S * TIME_SEC_MICROSEC;
+        U32 nbRounds = (U32)((50 MB) / (srcSize+1)) + 1;   /* initial conservative speed estimate */
+#       define TIME_SEC_MICROSEC    (1*1000000ULL) /* 1 second */
+#       define TIME_SEC_NANOSEC     (1*1000000000ULL) /* 1 second */
        DISPLAY("%2i- %-30.30s : \r", benchNb, benchName);
        for (loopNb = 1; loopNb <= g_nbIterations; loopNb++) {
            UTIL_time_t clockStart;
            size_t benchResult=0;
-            U32 nbRounds;
+            U32 roundNb;

-            UTIL_sleepMilli(1);  /* give processor time to other processes */
+            UTIL_sleepMilli(5);  /* give processor time to other processes */
            UTIL_waitForNextTick();
            clockStart = UTIL_getTime();
-            for (nbRounds=0; UTIL_clockSpanMicro(clockStart) < clockLoop; nbRounds++) {
+            for (roundNb=0; roundNb < nbRounds; roundNb++) {
                benchResult = benchFunction(dstBuff, dstBuffSize, buff2, src, srcSize);
-                if (ZSTD_isError(benchResult)) { DISPLAY("ERROR ! %s() => %s !! \n", benchName, ZSTD_getErrorName(benchResult)); exit(1); }
-            }
-            {   U64 const clockSpanMicro = UTIL_clockSpanMicro(clockStart);
-                double const averageTime = (double)clockSpanMicro / TIME_SEC_MICROSEC / nbRounds;
-                if (averageTime < bestTime) bestTime = averageTime;
-                DISPLAY("%2i- %-30.30s : %7.1f MB/s  (%9u)\r", loopNb, benchName, (double)srcSize / (1 MB) / bestTime, (U32)benchResult);
+                if (ZSTD_isError(benchResult)) {
+                    DISPLAY("ERROR ! %s() => %s !! \n", benchName, ZSTD_getErrorName(benchResult));
+                    exit(1);
+            }   }
+            {   U64 const clockSpanNano = UTIL_clockSpanNano(clockStart);
+                double const averageTime = (double)clockSpanNano / TIME_SEC_NANOSEC / nbRounds;
+                if (clockSpanNano > 0) {
+                    if (averageTime < bestTime) bestTime = averageTime;
+                    assert(bestTime > (1./2000000000));
+                    nbRounds = (U32)(1. / bestTime);   /* aim for 1 sec */
+                    DISPLAY("%2i- %-30.30s : %7.1f MB/s  (%9u)\r",
+                            loopNb, benchName,
+                            (double)srcSize / (1 MB) / bestTime,
+                            (U32)benchResult);
+                } else {
+                    assert(nbRounds < 40000000);  /* avoid overflow */
+                    nbRounds *= 100;
+                }
    }   }   }
    DISPLAY("%2u\n", benchNb);

@ -573,7 +589,7 @@ int main(int argc, const char** argv)

    for(i=1; i<argc; i++) {
        const char* argument = argv[i];
-        if(!argument) continue;   /* Protection if argument empty */
+        assert(argument != NULL);

        /* Commands (note : aggregated commands are allowed) */
        if (argument[0]=='-') {
--- a/sys/contrib/zstd/tests/fuzz/zstd_helpers.c
+++ b/sys/contrib/zstd/tests/fuzz/zstd_helpers.c
@ -35,7 +35,7 @@ ZSTD_compressionParameters FUZZ_randomCParams(size_t srcSize, uint32_t *state)
    cParams.searchLength = FUZZ_rand32(state, ZSTD_SEARCHLENGTH_MIN,
                                              ZSTD_SEARCHLENGTH_MAX);
    cParams.targetLength = FUZZ_rand32(state, ZSTD_TARGETLENGTH_MIN,
-                                              ZSTD_TARGETLENGTH_MAX);
+                                              512);
    cParams.strategy = FUZZ_rand32(state, ZSTD_fast, ZSTD_btultra);
    return ZSTD_adjustCParams(cParams, srcSize, 0);
 }
--- a/sys/contrib/zstd/tests/fuzzer.c
+++ b/sys/contrib/zstd/tests/fuzzer.c
--- a/sys/contrib/zstd/tests/legacy.c
+++ b/sys/contrib/zstd/tests/legacy.c
@ -94,21 +94,19 @@ int testStreamingAPI(void)
        if (needsInit) {
            size_t const ret = ZSTD_initDStream(stream);
            if (ZSTD_isError(ret)) {
-                DISPLAY("ERROR: %s\n", ZSTD_getErrorName(ret));
+                DISPLAY("ERROR: ZSTD_initDStream: %s\n", ZSTD_getErrorName(ret));
                return 1;
-            }
-        }
-        {
-            size_t const ret = ZSTD_decompressStream(stream, &output, &input);
+        }   }
+
+        {   size_t const ret = ZSTD_decompressStream(stream, &output, &input);
            if (ZSTD_isError(ret)) {
-                DISPLAY("ERROR: %s\n", ZSTD_getErrorName(ret));
+                DISPLAY("ERROR: ZSTD_decompressStream: %s\n", ZSTD_getErrorName(ret));
                return 1;
            }

            if (ret == 0) {
                needsInit = 1;
-            }
-        }
+        }   }

        if (memcmp(outBuff, EXPECTED + outputPos, output.pos) != 0) {
            DISPLAY("ERROR: Wrong decoded output produced\n");
@ -128,15 +126,12 @@ int testStreamingAPI(void)

 int main(void)
 {
-    int ret;
-
-    ret = testSimpleAPI();
-    if (ret) return ret;
-    ret = testStreamingAPI();
-    if (ret) return ret;
+    {   int const ret = testSimpleAPI();
+        if (ret) return ret; }
+    {   int const ret = testStreamingAPI();
+        if (ret) return ret; }

    DISPLAY("OK\n");
-
    return 0;
 }

--- a/sys/contrib/zstd/tests/paramgrill.c
+++ b/sys/contrib/zstd/tests/paramgrill.c
@ -547,12 +547,12 @@ static ZSTD_compressionParameters randomParams(void)
    U32 validated = 0;
    while (!validated) {
        /* totally random entry */
-        p.chainLog   = FUZ_rand(&g_rand) % (ZSTD_CHAINLOG_MAX+1 - ZSTD_CHAINLOG_MIN) + ZSTD_CHAINLOG_MIN;
-        p.hashLog    = FUZ_rand(&g_rand) % (ZSTD_HASHLOG_MAX+1 - ZSTD_HASHLOG_MIN) + ZSTD_HASHLOG_MIN;
-        p.searchLog  = FUZ_rand(&g_rand) % (ZSTD_SEARCHLOG_MAX+1 - ZSTD_SEARCHLOG_MIN) + ZSTD_SEARCHLOG_MIN;
-        p.windowLog  = FUZ_rand(&g_rand) % (ZSTD_WINDOWLOG_MAX+1 - ZSTD_WINDOWLOG_MIN) + ZSTD_WINDOWLOG_MIN;
-        p.searchLength=FUZ_rand(&g_rand) % (ZSTD_SEARCHLENGTH_MAX+1 - ZSTD_SEARCHLENGTH_MIN) + ZSTD_SEARCHLENGTH_MIN;
-        p.targetLength=FUZ_rand(&g_rand) % (ZSTD_TARGETLENGTH_MAX+1 - ZSTD_TARGETLENGTH_MIN) + ZSTD_TARGETLENGTH_MIN;
+        p.chainLog   = (FUZ_rand(&g_rand) % (ZSTD_CHAINLOG_MAX+1 - ZSTD_CHAINLOG_MIN)) + ZSTD_CHAINLOG_MIN;
+        p.hashLog    = (FUZ_rand(&g_rand) % (ZSTD_HASHLOG_MAX+1 - ZSTD_HASHLOG_MIN)) + ZSTD_HASHLOG_MIN;
+        p.searchLog  = (FUZ_rand(&g_rand) % (ZSTD_SEARCHLOG_MAX+1 - ZSTD_SEARCHLOG_MIN)) + ZSTD_SEARCHLOG_MIN;
+        p.windowLog  = (FUZ_rand(&g_rand) % (ZSTD_WINDOWLOG_MAX+1 - ZSTD_WINDOWLOG_MIN)) + ZSTD_WINDOWLOG_MIN;
+        p.searchLength=(FUZ_rand(&g_rand) % (ZSTD_SEARCHLENGTH_MAX+1 - ZSTD_SEARCHLENGTH_MIN)) + ZSTD_SEARCHLENGTH_MIN;
+        p.targetLength=(FUZ_rand(&g_rand) % (512)) + ZSTD_TARGETLENGTH_MIN;
        p.strategy   = (ZSTD_strategy) (FUZ_rand(&g_rand) % (ZSTD_btultra +1));
        validated = !ZSTD_isError(ZSTD_checkCParams(p));
    }
--- a/sys/contrib/zstd/tests/playTests.sh
+++ b/sys/contrib/zstd/tests/playTests.sh
@ -93,6 +93,7 @@ else
    hasMT="true"
 fi

+
 $ECHO "\n===>  simple tests "

 ./datagen > tmp
@ -100,8 +101,12 @@ $ECHO "test : basic compression "
 $ZSTD -f tmp                      # trivial compression case, creates tmp.zst
 $ECHO "test : basic decompression"
 $ZSTD -df tmp.zst                 # trivial decompression case (overwrites tmp)
-$ECHO "test : too large compression level (must fail)"
+$ECHO "test : too large compression level => auto-fix"
 $ZSTD -99 -f tmp  # too large compression level, automatic sized down
+$ECHO "test : --fast aka negative compression levels"
+$ZSTD --fast -f tmp  # == -1
+$ZSTD --fast=3 -f tmp  # == -3
+$ZSTD --fast=200000 -f tmp  # == no compression
 $ECHO "test : compress to stdout"
 $ZSTD tmp -c > tmpCompressed
 $ZSTD tmp --stdout > tmpCompressed       # long command format
@ -190,10 +195,16 @@ $ZSTD -t tmp1.zst tmp2.zst
 $ZSTD -dc tmp1.zst tmp2.zst
 $ZSTD tmp1.zst tmp2.zst -o "$INTOVOID"
 $ZSTD -d tmp1.zst tmp2.zst -o tmp
+touch tmpexists
+$ZSTD tmp1 tmp2 -f -o tmpexists
+$ZSTD tmp1 tmp2 -o tmpexists && die "should have refused to overwrite"
+# Bug: PR #972
+if [ "$?" -eq 139 ]; then
+  die "should not have segfaulted"
+fi
 rm tmp*


-
 $ECHO "\n===>  Advanced compression parameters "
 $ECHO "Hello world!" | $ZSTD --zstd=windowLog=21,      - -o tmp.zst && die "wrong parameters not detected!"
 $ECHO "Hello world!" | $ZSTD --zstd=windowLo=21        - -o tmp.zst && die "wrong parameters not detected!"
@ -203,8 +214,8 @@ roundTripTest -g512K
 roundTripTest -g512K " --zstd=slen=3,tlen=48,strat=6"
 roundTripTest -g512K " --zstd=strat=6,wlog=23,clog=23,hlog=22,slog=6"
 roundTripTest -g512K " --zstd=windowLog=23,chainLog=23,hashLog=22,searchLog=6,searchLength=3,targetLength=48,strategy=6"
-roundTripTest -g512K " --long --zstd=ldmHashLog=20,ldmSearchLength=64,ldmBucketSizeLog=1,ldmHashEveryLog=7"
-roundTripTest -g512K " --long --zstd=ldmhlog=20,ldmslen=64,ldmblog=1,ldmhevery=7"
+roundTripTest -g512K " --single-thread --long --zstd=ldmHashLog=20,ldmSearchLength=64,ldmBucketSizeLog=1,ldmHashEveryLog=7"
+roundTripTest -g512K " --single-thread --long --zstd=ldmhlog=20,ldmslen=64,ldmblog=1,ldmhevery=7"
 roundTripTest -g512K 19


@ -231,8 +242,18 @@ $ZSTD -c hello.tmp > hello.zstd --no-check
 $ZSTD -c world.tmp > world.zstd --no-check
 cat hello.zstd world.zstd > helloworld.zstd
 $ZSTD -dc helloworld.zstd > result.tmp
-cat result.tmp
 $DIFF helloworld.tmp result.tmp
+$ECHO "testing zstdcat symlink"
+ln -sf $ZSTD zstdcat
+./zstdcat helloworld.zstd > result.tmp
+$DIFF helloworld.tmp result.tmp
+rm zstdcat
+rm result.tmp
+$ECHO "testing zcat symlink"
+ln -sf $ZSTD zcat
+./zcat helloworld.zstd > result.tmp
+$DIFF helloworld.tmp result.tmp
+rm zcat
 rm ./*.tmp ./*.zstd
 $ECHO "frame concatenation tests completed"

@ -322,9 +343,14 @@ $ECHO "- Create first dictionary "
 TESTFILE=../programs/zstdcli.c
 $ZSTD --train *.c ../programs/*.c -o tmpDict
 cp $TESTFILE tmp
+$ECHO "- Dictionary compression roundtrip"
 $ZSTD -f tmp -D tmpDict
 $ZSTD -d tmp.zst -D tmpDict -fo result
 $DIFF $TESTFILE result
+$ECHO "- Dictionary compression with btlazy2 strategy"
+$ZSTD -f tmp -D tmpDict --zstd=strategy=6
+$ZSTD -d tmp.zst -D tmpDict -fo result
+$DIFF $TESTFILE result
 if [ -n "$hasMT" ]
 then
    $ECHO "- Test dictionary compression with multithreading "
@ -451,6 +477,8 @@ $ECHO "bench one file"
 $ZSTD -bi0 tmp1
 $ECHO "bench multiple levels"
 $ZSTD -i0b0e3 tmp1
+$ECHO "bench negative level"
+$ZSTD -bi0 --fast tmp1
 $ECHO "with recursive and quiet modes"
 $ZSTD -rqi1b1e2 tmp1

@ -603,14 +631,15 @@ roundTripTest -g516K 19   # btopt
 fileRoundTripTest -g500K

 $ECHO "\n===>  zstd long distance matching round-trip tests "
-roundTripTest -g0 "2 --long"
-roundTripTest -g1000K "1 --long"
-roundTripTest -g517K "6 --long"
-roundTripTest -g516K "16 --long"
-roundTripTest -g518K "19 --long"
-fileRoundTripTest -g5M "3 --long"
+roundTripTest -g0 "2 --single-thread --long"
+roundTripTest -g1000K "1 --single-thread --long"
+roundTripTest -g517K "6 --single-thread --long"
+roundTripTest -g516K "16 --single-thread --long"
+roundTripTest -g518K "19 --single-thread --long"
+fileRoundTripTest -g5M "3 --single-thread --long"


+roundTripTest -g96K "5 --single-thread"
 if [ -n "$hasMT" ]
 then
    $ECHO "\n===>  zstdmt round-trip tests "
@ -620,7 +649,7 @@ then
    fileRoundTripTest -g4M "19 -T2 -B1M"

    $ECHO "\n===>  zstdmt long distance matching round-trip tests "
-    roundTripTest -g8M "3 --long -T2"
+    roundTripTest -g8M "3 --long=24 -T2"
 else
    $ECHO "\n===>  no multithreading, skipping zstdmt tests "
 fi
@ -671,13 +700,13 @@ rm tmp*


 $ECHO "\n===>   zstd long distance matching tests "
-roundTripTest -g0 " --long"
-roundTripTest -g9M "2 --long"
+roundTripTest -g0 " --single-thread --long"
+roundTripTest -g9M "2 --single-thread --long"
 # Test parameter parsing
-roundTripTest -g1M -P50 "1 --long=29" " --memory=512MB"
-roundTripTest -g1M -P50 "1 --long=29 --zstd=wlog=28" " --memory=256MB"
-roundTripTest -g1M -P50 "1 --long=29" " --long=28 --memory=512MB"
-roundTripTest -g1M -P50 "1 --long=29" " --zstd=wlog=28 --memory=512MB"
+roundTripTest -g1M -P50 "1 --single-thread --long=29" " --memory=512MB"
+roundTripTest -g1M -P50 "1 --single-thread --long=29 --zstd=wlog=28" " --memory=256MB"
+roundTripTest -g1M -P50 "1 --single-thread --long=29" " --long=28 --memory=512MB"
+roundTripTest -g1M -P50 "1 --single-thread --long=29" " --zstd=wlog=28 --memory=512MB"


 if [ "$1" != "--test-large-data" ]; then
@ -712,18 +741,19 @@ roundTripTest -g18000018 -P94 18
 roundTripTest -g18000019 -P96 19

 roundTripTest -g5000000000 -P99 1
+roundTripTest -g1700000000 -P0 "1 --zstd=strategy=6"   # ensure btlazy2 can survive an overflow rescale

 fileRoundTripTest -g4193M -P99 1


 $ECHO "\n===>   zstd long, long distance matching round-trip tests "
-roundTripTest -g270000000 "1 --long"
-roundTripTest -g130000000 -P60 "5 --long"
-roundTripTest -g35000000 -P70 "8 --long"
-roundTripTest -g18000001 -P80  "18 --long"
+roundTripTest -g270000000 "1 --single-thread --long"
+roundTripTest -g130000000 -P60 "5 --single-thread --long"
+roundTripTest -g35000000 -P70 "8 --single-thread --long"
+roundTripTest -g18000001 -P80  "18 --single-thread --long"
 # Test large window logs
-roundTripTest -g700M -P50 "1 --long=29"
-roundTripTest -g600M -P50 "1 --long --zstd=wlog=29,clog=28"
+roundTripTest -g700M -P50 "1 --single-thread --long=29"
+roundTripTest -g600M -P50 "1 --single-thread --long --zstd=wlog=29,clog=28"


 if [ -n "$hasMT" ]
--- a/sys/contrib/zstd/tests/roundTripCrash.c
+++ b/sys/contrib/zstd/tests/roundTripCrash.c
@ -94,7 +94,7 @@ static size_t cctxParamRoundTripTest(void* resultBuff, size_t resultBuffCapacity

    /* Set parameters */
    CHECK_Z( ZSTD_CCtxParam_setParameter(cctxParams, ZSTD_p_compressionLevel, cLevel) );
-    CHECK_Z( ZSTD_CCtxParam_setParameter(cctxParams, ZSTD_p_nbThreads, 2) );
+    CHECK_Z( ZSTD_CCtxParam_setParameter(cctxParams, ZSTD_p_nbWorkers, 2) );
    CHECK_Z( ZSTD_CCtxParam_setParameter(cctxParams, ZSTD_p_overlapSizeLog, 5) );


--- a/sys/contrib/zstd/tests/zstreamtest.c
+++ b/sys/contrib/zstd/tests/zstreamtest.c
@ -99,7 +99,7 @@ unsigned int FUZ_rand(unsigned int* seedPtr)
    if (cond) {                                              \
        DISPLAY("Error => ");                                \
        DISPLAY(__VA_ARGS__);                                \
-        DISPLAY(" (seed %u, test nb %u, line %u)  \n",       \
+        DISPLAY(" (seed %u, test nb %u, line %u) \n",        \
                seed, testNb, __LINE__);                     \
        goto _output_error;                                  \
 }   }
@ -219,6 +219,7 @@ static int basicUnitTests(U32 seed, double compressibility)
    size_t cSize;
    int testResult = 0;
    U32 testNb = 1;
+    U32 coreSeed = 0;  /* this name to conform with CHECK_Z macro display */
    ZSTD_CStream* zc = ZSTD_createCStream();
    ZSTD_DStream* zd = ZSTD_createDStream();
    ZSTDMT_CCtx* mtctx = ZSTDMT_createCCtx(2);
@ -238,7 +239,7 @@ static int basicUnitTests(U32 seed, double compressibility)

    /* Create dictionary */
    DISPLAYLEVEL(3, "creating dictionary for unit tests \n");
-    dictionary = FUZ_createDictionary(CNBuffer, CNBufferSize / 2, 8 KB, 40 KB);
+    dictionary = FUZ_createDictionary(CNBuffer, CNBufferSize / 3, 16 KB, 48 KB);
    if (!dictionary.start) {
        DISPLAY("Error creating dictionary, aborting \n");
        goto _output_error;
@ -372,7 +373,7 @@ static int basicUnitTests(U32 seed, double compressibility)
      DISPLAYLEVEL(3, "OK (%u bytes) \n", (U32)s);
    }

-    /* Byte-by-byte decompression test */
+    /* Decompression by small increment */
    DISPLAYLEVEL(3, "test%3i : decompress byte-by-byte : ", testNb++);
    {   /* skippable frame */
        size_t r = 1;
@ -382,8 +383,10 @@ static int basicUnitTests(U32 seed, double compressibility)
        inBuff.pos = 0;
        outBuff.pos = 0;
        while (r) {   /* skippable frame */
-            inBuff.size = inBuff.pos + 1;
-            outBuff.size = outBuff.pos + 1;
+            size_t const inSize = FUZ_rand(&coreSeed) & 15;
+            size_t const outSize = FUZ_rand(&coreSeed) & 15;
+            inBuff.size = inBuff.pos + inSize;
+            outBuff.size = outBuff.pos + outSize;
            r = ZSTD_decompressStream(zd, &outBuff, &inBuff);
            if (ZSTD_isError(r)) goto _output_error;
        }
@ -391,8 +394,10 @@ static int basicUnitTests(U32 seed, double compressibility)
        ZSTD_initDStream_usingDict(zd, CNBuffer, dictSize);
        r=1;
        while (r) {
-            inBuff.size = inBuff.pos + 1;
-            outBuff.size = outBuff.pos + 1;
+            size_t const inSize = FUZ_rand(&coreSeed) & 15;
+            size_t const outSize = FUZ_rand(&coreSeed) & 15;
+            inBuff.size = inBuff.pos + inSize;
+            outBuff.size = outBuff.pos + outSize;
            r = ZSTD_decompressStream(zd, &outBuff, &inBuff);
            if (ZSTD_isError(r)) goto _output_error;
        }
@ -427,8 +432,8 @@ static int basicUnitTests(U32 seed, double compressibility)
    DISPLAYLEVEL(3, "OK (%u bytes : %.2f%%)\n", (U32)cSize, (double)cSize/COMPRESSIBLE_NOISE_LENGTH*100);

    /* wrong _srcSize compression test */
-    DISPLAYLEVEL(3, "test%3i : wrong srcSize : %u bytes : ", testNb++, COMPRESSIBLE_NOISE_LENGTH-1);
-    ZSTD_initCStream_srcSize(zc, 1, CNBufferSize-1);
+    DISPLAYLEVEL(3, "test%3i : too large srcSize : %u bytes : ", testNb++, COMPRESSIBLE_NOISE_LENGTH-1);
+    ZSTD_initCStream_srcSize(zc, 1, CNBufferSize+1);
    outBuff.dst = (char*)(compressedBuffer);
    outBuff.size = compressedBufferSize;
    outBuff.pos = 0;
@ -441,6 +446,20 @@ static int basicUnitTests(U32 seed, double compressibility)
      if (ZSTD_getErrorCode(r) != ZSTD_error_srcSize_wrong) goto _output_error;    /* must fail : wrong srcSize */
      DISPLAYLEVEL(3, "OK (error detected : %s) \n", ZSTD_getErrorName(r)); }

+    /* wrong _srcSize compression test */
+    DISPLAYLEVEL(3, "test%3i : too small srcSize : %u bytes : ", testNb++, COMPRESSIBLE_NOISE_LENGTH-1);
+    ZSTD_initCStream_srcSize(zc, 1, CNBufferSize-1);
+    outBuff.dst = (char*)(compressedBuffer);
+    outBuff.size = compressedBufferSize;
+    outBuff.pos = 0;
+    inBuff.src = CNBuffer;
+    inBuff.size = CNBufferSize;
+    inBuff.pos = 0;
+    {   size_t const r = ZSTD_compressStream(zc, &outBuff, &inBuff);
+        if (ZSTD_getErrorCode(r) != ZSTD_error_srcSize_wrong) goto _output_error;    /* must fail : wrong srcSize */
+        DISPLAYLEVEL(3, "OK (error detected : %s) \n", ZSTD_getErrorName(r));
+    }
+
    /* Complex context re-use scenario */
    DISPLAYLEVEL(3, "test%3i : context re-use : ", testNb++);
    ZSTD_freeCStream(zc);
@ -557,11 +576,50 @@ static int basicUnitTests(U32 seed, double compressibility)
    { size_t const r = ZSTD_decompressStream(zd, &outBuff, &inBuff);
      if (!ZSTD_isError(r)) goto _output_error;  /* must fail : frame requires > 100 bytes */
      DISPLAYLEVEL(3, "OK (%s)\n", ZSTD_getErrorName(r)); }
+    ZSTD_DCtx_reset(zd);   /* leave zd in good shape for next tests */
+
+    DISPLAYLEVEL(3, "test%3i : dictionary source size and level : ", testNb++);
+    {   ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+        int const maxLevel = 16;   /* first level with zstd_opt */
+        int level;
+        assert(maxLevel < ZSTD_maxCLevel());
+        CHECK_Z( ZSTD_DCtx_loadDictionary_byReference(dctx, dictionary.start, dictionary.filled) );
+        for (level = 1; level <= maxLevel; ++level) {
+            ZSTD_CDict* const cdict = ZSTD_createCDict(dictionary.start, dictionary.filled, level);
+            size_t const maxSize = MIN(1 MB, CNBufferSize);
+            size_t size;
+            for (size = 512; size <= maxSize; size <<= 1) {
+                U64 const crcOrig = XXH64(CNBuffer, size, 0);
+                ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+                outBuff.dst = compressedBuffer;
+                outBuff.size = compressedBufferSize;
+                outBuff.pos = 0;
+                inBuff.src = CNBuffer;
+                inBuff.size = size;
+                inBuff.pos = 0;
+                CHECK_Z(ZSTD_CCtx_refCDict(cctx, cdict));
+                CHECK_Z(ZSTD_compress_generic(cctx, &outBuff, &inBuff, ZSTD_e_end));
+                if (inBuff.pos != inBuff.size) goto _output_error;
+                {   ZSTD_outBuffer decOut = {decodedBuffer, size, 0};
+                    ZSTD_inBuffer decIn = {outBuff.dst, outBuff.pos, 0};
+                    CHECK_Z( ZSTD_decompress_generic(dctx, &decOut, &decIn) );
+                    if (decIn.pos != decIn.size) goto _output_error;
+                    if (decOut.pos != size) goto _output_error;
+                    {   U64 const crcDec = XXH64(decOut.dst, decOut.pos, 0);
+                        if (crcDec != crcOrig) goto _output_error;
+                }   }
+                ZSTD_freeCCtx(cctx);
+            }
+            ZSTD_freeCDict(cdict);
+        }
+        ZSTD_freeDCtx(dctx);
+    }
+    DISPLAYLEVEL(3, "OK\n");

    DISPLAYLEVEL(3, "test%3i : ZSTD_initCStream_usingCDict_advanced with masked dictID : ", testNb++);
    {   ZSTD_compressionParameters const cParams = ZSTD_getCParams(1, CNBufferSize, dictionary.filled);
        ZSTD_frameParameters const fParams = { 1 /* contentSize */, 1 /* checksum */, 1 /* noDictID */};
-        ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictionary.start, dictionary.filled, ZSTD_dlm_byRef, ZSTD_dm_auto, cParams, ZSTD_defaultCMem);
+        ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictionary.start, dictionary.filled, ZSTD_dlm_byRef, ZSTD_dct_auto, cParams, ZSTD_defaultCMem);
        size_t const initError = ZSTD_initCStream_usingCDict_advanced(zc, cdict, fParams, CNBufferSize);
        if (ZSTD_isError(initError)) goto _output_error;
        cSize = 0;
@ -605,14 +663,18 @@ static int basicUnitTests(U32 seed, double compressibility)
    cSize = outBuff.pos;
    DISPLAYLEVEL(3, "OK (%u bytes : %.2f%%)\n", (U32)cSize, (double)cSize/CNBufferSize*100);

-    DISPLAYLEVEL(3, "test%3i : decompress with dictionary : ", testNb++);
-    {   size_t const r = ZSTD_decompress_usingDict(zd,
-                                        decodedBuffer, CNBufferSize,
-                                        compressedBuffer, cSize,
-                                        dictionary.start, dictionary.filled);
-        if (ZSTD_isError(r)) goto _output_error;  /* must fail : dictionary not used */
-        DISPLAYLEVEL(3, "OK \n");
-    }
+    DISPLAYLEVEL(3, "test%3i : decompress with ZSTD_DCtx_refPrefix : ", testNb++);
+    CHECK_Z( ZSTD_DCtx_refPrefix(zd, dictionary.start, dictionary.filled) );
+    outBuff.dst = decodedBuffer;
+    outBuff.size = CNBufferSize;
+    outBuff.pos = 0;
+    inBuff.src = compressedBuffer;
+    inBuff.size = cSize;
+    inBuff.pos = 0;
+    CHECK_Z( ZSTD_decompress_generic(zd, &outBuff, &inBuff) );
+    if (inBuff.pos != inBuff.size) goto _output_error;  /* entire input should be consumed */
+    if (outBuff.pos != CNBufferSize) goto _output_error;  /* must regenerate whole input */
+    DISPLAYLEVEL(3, "OK \n");

    DISPLAYLEVEL(3, "test%3i : decompress without dictionary (should fail): ", testNb++);
    {   size_t const r = ZSTD_decompress(decodedBuffer, CNBufferSize, compressedBuffer, cSize);
@ -694,16 +756,19 @@ static int basicUnitTests(U32 seed, double compressibility)
    inBuff.src = CNBuffer;
    inBuff.size = CNBufferSize;
    inBuff.pos = 0;
-    CHECK_Z( ZSTDMT_compressStream_generic(mtctx, &outBuff, &inBuff, ZSTD_e_end) );
+    {   size_t const compressResult = ZSTDMT_compressStream_generic(mtctx, &outBuff, &inBuff, ZSTD_e_end);
+        if (compressResult != 0) goto _output_error;  /* compression must be completed in a single round */
+    }
    if (inBuff.pos != inBuff.size) goto _output_error;   /* entire input should be consumed */
-    { size_t const r = ZSTDMT_endStream(mtctx, &outBuff);
-      if (r != 0) goto _output_error; }  /* error, or some data not flushed */
+    {   size_t const compressedSize = ZSTD_findFrameCompressedSize(compressedBuffer, outBuff.pos);
+        if (compressedSize != outBuff.pos) goto _output_error;  /* must be a full valid frame */
+    }
    DISPLAYLEVEL(3, "OK \n");

    /* Complex multithreading + dictionary test */
-    {   U32 const nbThreads = 2;
+    {   U32 const nbWorkers = 2;
        size_t const jobSize = 4 * 1 MB;
-        size_t const srcSize = jobSize * nbThreads;  /* we want each job to have predictable size */
+        size_t const srcSize = jobSize * nbWorkers;  /* we want each job to have predictable size */
        size_t const segLength = 2 KB;
        size_t const offset = 600 KB;   /* must be larger than window defined in cdict */
        size_t const start = jobSize + (offset-1);
@ -711,7 +776,7 @@ static int basicUnitTests(U32 seed, double compressibility)
        BYTE* const dst = (BYTE*)CNBuffer + start - offset;
        DISPLAYLEVEL(3, "test%3i : compress %u bytes with multiple threads + dictionary : ", testNb++, (U32)srcSize);
        CHECK_Z( ZSTD_CCtx_setParameter(zc, ZSTD_p_compressionLevel, 3) );
-        CHECK_Z( ZSTD_CCtx_setParameter(zc, ZSTD_p_nbThreads, 2) );
+        CHECK_Z( ZSTD_CCtx_setParameter(zc, ZSTD_p_nbWorkers, nbWorkers) );
        CHECK_Z( ZSTD_CCtx_setParameter(zc, ZSTD_p_jobSize, jobSize) );
        assert(start > offset);
        assert(start + segLength < COMPRESSIBLE_NOISE_LENGTH);
@ -724,7 +789,7 @@ static int basicUnitTests(U32 seed, double compressibility)
        inBuff.pos = 0;
    }
    {   ZSTD_compressionParameters const cParams = ZSTD_getCParams(1, 4 KB, dictionary.filled);   /* intentionnally lies on estimatedSrcSize, to push cdict into targeting a small window size */
-        ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictionary.start, dictionary.filled, ZSTD_dlm_byRef, ZSTD_dm_fullDict, cParams, ZSTD_defaultCMem);
+        ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictionary.start, dictionary.filled, ZSTD_dlm_byRef, ZSTD_dct_fullDict, cParams, ZSTD_defaultCMem);
        DISPLAYLEVEL(5, "cParams.windowLog = %u : ", cParams.windowLog);
        CHECK_Z( ZSTD_CCtx_refCDict(zc, cdict) );
        CHECK_Z( ZSTD_compress_generic(zc, &outBuff, &inBuff, ZSTD_e_end) );
@ -767,7 +832,7 @@ static int basicUnitTests(U32 seed, double compressibility)

        XXH64_reset(&xxh, 0);
        cParams.windowLog = kMaxWindowLog;
-        cdict = ZSTD_createCDict_advanced(dictionary.start, dictionary.filled, ZSTD_dlm_byRef, ZSTD_dm_fullDict, cParams, ZSTD_defaultCMem);
+        cdict = ZSTD_createCDict_advanced(dictionary.start, dictionary.filled, ZSTD_dlm_byRef, ZSTD_dct_fullDict, cParams, ZSTD_defaultCMem);
        ddict = ZSTD_createDDict(dictionary.start, dictionary.filled);

        if (!cdict || !ddict) goto _output_error;
@ -863,11 +928,21 @@ static size_t findDiff(const void* buf1, const void* buf2, size_t max)
    for (u=0; u<max; u++) {
        if (b1[u] != b2[u]) break;
    }
+    if (u==max) {
+        DISPLAY("=> No difference detected within %u bytes \n", (U32)max);
+        return u;
+    }
    DISPLAY("Error at position %u / %u \n", (U32)u, (U32)max);
-    DISPLAY(" %02X %02X %02X  :%02X:  %02X %02X %02X %02X %02X \n",
-            b1[u-3], b1[u-2], b1[u-1], b1[u-0], b1[u+1], b1[u+2], b1[u+3], b1[u+4], b1[u+5]);
-    DISPLAY(" %02X %02X %02X  :%02X:  %02X %02X %02X %02X %02X \n",
-            b2[u-3], b2[u-2], b2[u-1], b2[u-0], b2[u+1], b2[u+2], b2[u+3], b2[u+4], b2[u+5]);
+    if (u>=3)
+        DISPLAY(" %02X %02X %02X ",
+                b1[u-3], b1[u-2], b1[u-1]);
+    DISPLAY(" :%02X:  %02X %02X %02X %02X %02X \n",
+            b1[u], b1[u+1], b1[u+2], b1[u+3], b1[u+4], b1[u+5]);
+    if (u>=3)
+        DISPLAY(" %02X %02X %02X ",
+                b2[u-3], b2[u-2], b2[u-1]);
+    DISPLAY(" :%02X:  %02X %02X %02X %02X %02X \n",
+            b2[u], b2[u+1], b2[u+2], b2[u+3], b2[u+4], b2[u+5]);
    return u;
 }

@ -948,10 +1023,13 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compres
        size_t maxTestSize;

        /* init */
-        if (nbTests >= testNb) { DISPLAYUPDATE(2, "\r%6u/%6u    ", testNb, nbTests); }
-        else { DISPLAYUPDATE(2, "\r%6u          ", testNb); }
        FUZ_rand(&coreSeed);
        lseed = coreSeed ^ prime32;
+        if (nbTests >= testNb) {
+            DISPLAYUPDATE(2, "\r%6u/%6u    ", testNb, nbTests);
+        } else {
+            DISPLAYUPDATE(2, "\r%6u        ", testNb);
+        }

        /* states full reset (deliberately not synchronized) */
        /* some issues can only happen when reusing states */
@ -1161,7 +1239,6 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
    UTIL_time_t const startClock = UTIL_getTime();
    const BYTE* dict=NULL;   /* can keep same dict on 2 consecutive tests */
    size_t dictSize = 0;
-    U32 oldTestLog = 0;
    int const cLevelMax = bigTests ? (U32)ZSTD_maxCLevel()-1 : g_cLevelMax_smallTests;
    U32 const nbThreadsMax = bigTests ? 4 : 2;

@ -1183,6 +1260,7 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
    RDG_genBuffer(cNoiseBuffer[4], srcBufferSize, 1.00, 0., coreSeed);    /* sparse content */
    memset(copyBuffer, 0x65, copyBufferSize);                             /* make copyBuffer considered initialized */
    ZSTD_initDStream_usingDict(zd, NULL, 0);  /* ensure at least one init */
+    DISPLAYLEVEL(6, "Creating initial context with %u threads \n", nbThreads);

    /* catch up testNb */
    for (testNb=1; testNb < startTest; testNb++)
@ -1195,14 +1273,14 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
        size_t totalTestSize, totalGenSize, cSize;
        XXH64_state_t xxhState;
        U64 crcOrig;
-        U32 resetAllowed = 1;
        size_t maxTestSize;

-        /* init */
-        if (testNb < nbTests) {
-            DISPLAYUPDATE(2, "\r%6u/%6u    ", testNb, nbTests);
-        } else { DISPLAYUPDATE(2, "\r%6u          ", testNb); }
        FUZ_rand(&coreSeed);
+        if (nbTests >= testNb) {
+            DISPLAYUPDATE(2, "\r%6u/%6u    ", testNb, nbTests);
+        } else {
+            DISPLAYUPDATE(2, "\r%6u         ", testNb);
+        }
        lseed = coreSeed ^ prime32;

        /* states full reset (deliberately not synchronized) */
@ -1213,7 +1291,6 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
            ZSTDMT_freeCCtx(zc);
            zc = ZSTDMT_createCCtx(nbThreads);
            CHECK(zc==NULL, "ZSTDMT_createCCtx allocation error")
-            resetAllowed=0;
        }
        if ((FUZ_rand(&lseed) & 0xFF) == 132) {
            ZSTD_freeDStream(zd);
@ -1238,15 +1315,7 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
        }

        /* compression init */
-        if ((FUZ_rand(&lseed)&1) /* at beginning, to keep same nb of rand */
-            && oldTestLog /* at least one test happened */ && resetAllowed) {
-            maxTestSize = FUZ_randomLength(&lseed, oldTestLog+2);
-            if (maxTestSize >= srcBufferSize) maxTestSize = srcBufferSize-1;
-            {   int const compressionLevel = (FUZ_rand(&lseed) % 5) + 1;
-                CHECK_Z( ZSTDMT_initCStream(zc, compressionLevel) );
-            }
-        } else {
-            U32 const testLog = FUZ_rand(&lseed) % maxSrcLog;
+        {   U32 const testLog = FUZ_rand(&lseed) % maxSrcLog;
            U32 const dictLog = FUZ_rand(&lseed) % maxSrcLog;
            int const cLevelCandidate = ( FUZ_rand(&lseed)
                            % (ZSTD_maxCLevel() - (MAX(testLog, dictLog) / 2)) )
@ -1255,24 +1324,29 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
            int const cLevelMin = MAX(cLevelThreadAdjusted, 1);  /* no negative cLevel yet */
            int const cLevel = MIN(cLevelMin, cLevelMax);
            maxTestSize = FUZ_rLogLength(&lseed, testLog);
-            oldTestLog = testLog;
-            /* random dictionary selection */
-            dictSize  = ((FUZ_rand(&lseed)&63)==1) ? FUZ_rLogLength(&lseed, dictLog) : 0;
-            {   size_t const dictStart = FUZ_rand(&lseed) % (srcBufferSize - dictSize);
-                dict = srcBuffer + dictStart;
-            }
-            {   U64 const pledgedSrcSize = (FUZ_rand(&lseed) & 3) ? ZSTD_CONTENTSIZE_UNKNOWN : maxTestSize;
-                ZSTD_parameters params = ZSTD_getParams(cLevel, pledgedSrcSize, dictSize);
-                DISPLAYLEVEL(5, "Init with windowLog = %u, pledgedSrcSize = %u, dictSize = %u \n",
-                    params.cParams.windowLog, (U32)pledgedSrcSize, (U32)dictSize);
-                params.fParams.checksumFlag = FUZ_rand(&lseed) & 1;
-                params.fParams.noDictIDFlag = FUZ_rand(&lseed) & 1;
-                params.fParams.contentSizeFlag = FUZ_rand(&lseed) & 1;
-                DISPLAYLEVEL(5, "checksumFlag : %u \n", params.fParams.checksumFlag);
-                CHECK_Z( ZSTDMT_setMTCtxParameter(zc, ZSTDMT_p_overlapSectionLog, FUZ_rand(&lseed) % 12) );
-                CHECK_Z( ZSTDMT_setMTCtxParameter(zc, ZSTDMT_p_jobSize, FUZ_rand(&lseed) % (2*maxTestSize+1)) );   /* custome job size */
-                CHECK_Z( ZSTDMT_initCStream_advanced(zc, dict, dictSize, params, pledgedSrcSize) );
-        }   }
+
+            if (FUZ_rand(&lseed)&1) {   /* simple init */
+                int const compressionLevel = (FUZ_rand(&lseed) % 5) + 1;
+                DISPLAYLEVEL(5, "Init with compression level = %i \n", compressionLevel);
+                CHECK_Z( ZSTDMT_initCStream(zc, compressionLevel) );
+            } else {   /* advanced init */
+                /* random dictionary selection */
+                dictSize  = ((FUZ_rand(&lseed)&63)==1) ? FUZ_rLogLength(&lseed, dictLog) : 0;
+                {   size_t const dictStart = FUZ_rand(&lseed) % (srcBufferSize - dictSize);
+                    dict = srcBuffer + dictStart;
+                }
+                {   U64 const pledgedSrcSize = (FUZ_rand(&lseed) & 3) ? ZSTD_CONTENTSIZE_UNKNOWN : maxTestSize;
+                    ZSTD_parameters params = ZSTD_getParams(cLevel, pledgedSrcSize, dictSize);
+                    DISPLAYLEVEL(5, "Init with windowLog = %u, pledgedSrcSize = %u, dictSize = %u \n",
+                        params.cParams.windowLog, (U32)pledgedSrcSize, (U32)dictSize);
+                    params.fParams.checksumFlag = FUZ_rand(&lseed) & 1;
+                    params.fParams.noDictIDFlag = FUZ_rand(&lseed) & 1;
+                    params.fParams.contentSizeFlag = FUZ_rand(&lseed) & 1;
+                    DISPLAYLEVEL(5, "checksumFlag : %u \n", params.fParams.checksumFlag);
+                    CHECK_Z( ZSTDMT_setMTCtxParameter(zc, ZSTDMT_p_overlapSectionLog, FUZ_rand(&lseed) % 12) );
+                    CHECK_Z( ZSTDMT_setMTCtxParameter(zc, ZSTDMT_p_jobSize, FUZ_rand(&lseed) % (2*maxTestSize+1)) );   /* custome job size */
+                    CHECK_Z( ZSTDMT_initCStream_advanced(zc, dict, dictSize, params, pledgedSrcSize) );
+        }   }   }

        /* multi-segments compression test */
        XXH64_reset(&xxhState, 0);
@ -1301,9 +1375,12 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
                if ((FUZ_rand(&lseed) & 15) == 0) {
                    size_t const randomDstSize = FUZ_randomLength(&lseed, maxSampleLog);
                    size_t const adjustedDstSize = MIN(cBufferSize - cSize, randomDstSize);
+                    size_t const previousPos = outBuff.pos;
                    outBuff.size = outBuff.pos + adjustedDstSize;
                    DISPLAYLEVEL(5, "Flushing into dst buffer of size %u \n", (U32)adjustedDstSize);
                    CHECK_Z( ZSTDMT_flushStream(zc, &outBuff) );
+                    assert(outBuff.pos >= previousPos);
+                    DISPLAYLEVEL(6, "%u bytes flushed by ZSTDMT_flushStream \n", (U32)(outBuff.pos-previousPos));
            }   }

            /* final frame epilogue */
@ -1311,18 +1388,24 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
                while (remainingToFlush) {
                    size_t const randomDstSize = FUZ_randomLength(&lseed, maxSampleLog);
                    size_t const adjustedDstSize = MIN(cBufferSize - cSize, randomDstSize);
+                    size_t const previousPos = outBuff.pos;
                    outBuff.size = outBuff.pos + adjustedDstSize;
                    DISPLAYLEVEL(5, "Ending into dst buffer of size %u \n", (U32)adjustedDstSize);
                    remainingToFlush = ZSTDMT_endStream(zc, &outBuff);
                    CHECK (ZSTD_isError(remainingToFlush), "ZSTDMT_endStream error : %s", ZSTD_getErrorName(remainingToFlush));
+                    assert(outBuff.pos >= previousPos);
+                    DISPLAYLEVEL(6, "%u bytes flushed by ZSTDMT_endStream \n", (U32)(outBuff.pos-previousPos));
                    DISPLAYLEVEL(5, "endStream : remainingToFlush : %u \n", (U32)remainingToFlush);
            }   }
            crcOrig = XXH64_digest(&xxhState);
            cSize = outBuff.pos;
-            DISPLAYLEVEL(5, "Frame completed : %u bytes \n", (U32)cSize);
+            DISPLAYLEVEL(5, "Frame completed : %u bytes compressed into %u bytes \n",
+                            (U32)totalTestSize, (U32)cSize);
        }

        /* multi - fragments decompression test */
+        assert(totalTestSize < dstBufferSize);
+        memset(dstBuffer, 170, totalTestSize);   /* init dest area */
        if (!dictSize /* don't reset if dictionary : could be different */ && (FUZ_rand(&lseed) & 1)) {
            CHECK_Z( ZSTD_resetDStream(zd) );
        } else {
@ -1337,10 +1420,16 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
                size_t const dstBuffSize = MIN(dstBufferSize - totalGenSize, randomDstSize);
                inBuff.size = inBuff.pos + readCSrcSize;
                outBuff.size = outBuff.pos + dstBuffSize;
-                DISPLAYLEVEL(6, "ZSTD_decompressStream input %u bytes \n", (U32)readCSrcSize);
+                DISPLAYLEVEL(6, "ZSTD_decompressStream input %u bytes into outBuff %u bytes \n",
+                                (U32)readCSrcSize, (U32)dstBuffSize);
                decompressionResult = ZSTD_decompressStream(zd, &outBuff, &inBuff);
+                if (ZSTD_isError(decompressionResult)) {
+                    DISPLAY("ZSTD_decompressStream error : %s \n", ZSTD_getErrorName(decompressionResult));
+                    findDiff(copyBuffer, dstBuffer, totalTestSize);
+                }
                CHECK (ZSTD_isError(decompressionResult), "decompression error : %s", ZSTD_getErrorName(decompressionResult));
-                DISPLAYLEVEL(6, "inBuff.pos = %u \n", (U32)readCSrcSize);
+                DISPLAYLEVEL(6, "total ingested (inBuff.pos) = %u and produced (outBuff.pos) = %u \n",
+                                (U32)inBuff.pos, (U32)outBuff.pos);
            }
            CHECK (outBuff.pos != totalTestSize, "decompressed data : wrong size (%u != %u)", (U32)outBuff.pos, (U32)totalTestSize);
            CHECK (inBuff.pos != cSize, "compressed data should be fully read (%u != %u)", (U32)inBuff.pos, (U32)cSize);
@ -1579,7 +1668,11 @@ static int fuzzerTests_newAPI(U32 seed, U32 nbTests, unsigned startTest, double
                }

                /* mess with frame parameters */
-                if (FUZ_rand(&lseed) & 1) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_p_checksumFlag, FUZ_rand(&lseed) & 1, useOpaqueAPI) );
+                if (FUZ_rand(&lseed) & 1) {
+                    U32 const checksumFlag = FUZ_rand(&lseed) & 1;
+                    DISPLAYLEVEL(5, "t%u: frame checksum : %u \n", testNb, checksumFlag);
+                    CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_p_checksumFlag, checksumFlag, useOpaqueAPI) );
+                }
                if (FUZ_rand(&lseed) & 1) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_p_dictIDFlag, FUZ_rand(&lseed) & 1, useOpaqueAPI) );
                if (FUZ_rand(&lseed) & 1) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_p_contentSizeFlag, FUZ_rand(&lseed) & 1, useOpaqueAPI) );
                if (FUZ_rand(&lseed) & 1) {
@ -1592,7 +1685,7 @@ static int fuzzerTests_newAPI(U32 seed, U32 nbTests, unsigned startTest, double
                    U32 const nbThreadsAdjusted = (windowLogMalus < nbThreadsCandidate) ? nbThreadsCandidate - windowLogMalus : 1;
                    U32 const nbThreads = MIN(nbThreadsAdjusted, nbThreadsMax);
                    DISPLAYLEVEL(5, "t%u: nbThreads : %u \n", testNb, nbThreads);
-                    CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_p_nbThreads, nbThreads, useOpaqueAPI) );
+                    CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_p_nbWorkers, nbThreads, useOpaqueAPI) );
                    if (nbThreads > 1) {
                        U32 const jobLog = FUZ_rand(&lseed) % (testLog+1);
                        CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_p_overlapSizeLog, FUZ_rand(&lseed) % 10, useOpaqueAPI) );
@ -1644,8 +1737,8 @@ static int fuzzerTests_newAPI(U32 seed, U32 nbTests, unsigned startTest, double
                outBuff.size = outBuff.pos + dstBuffSize;

                CHECK_Z( ZSTD_compress_generic(zc, &outBuff, &inBuff, flush) );
-                DISPLAYLEVEL(6, "t%u: compress consumed %u bytes (total : %u) \n",
-                    testNb, (U32)inBuff.pos, (U32)(totalTestSize + inBuff.pos));
+                DISPLAYLEVEL(6, "t%u: compress consumed %u bytes (total : %u) ; flush: %u (total : %u) \n",
+                    testNb, (U32)inBuff.pos, (U32)(totalTestSize + inBuff.pos), (U32)flush, (U32)outBuff.pos);

                XXH64_update(&xxhState, srcBuffer+srcStart, inBuff.pos);
                memcpy(copyBuffer+totalTestSize, srcBuffer+srcStart, inBuff.pos);
@ -1653,14 +1746,15 @@ static int fuzzerTests_newAPI(U32 seed, U32 nbTests, unsigned startTest, double
            }

            /* final frame epilogue */
-            {   size_t remainingToFlush = (size_t)(-1);
+            {   size_t remainingToFlush = 1;
                while (remainingToFlush) {
                    ZSTD_inBuffer inBuff = { NULL, 0, 0 };
                    size_t const randomDstSize = FUZ_randomLength(&lseed, maxSampleLog+1);
                    size_t const adjustedDstSize = MIN(cBufferSize - cSize, randomDstSize);
                    outBuff.size = outBuff.pos + adjustedDstSize;
-                    DISPLAYLEVEL(6, "End-flush into dst buffer of size %u \n", (U32)adjustedDstSize);
+                    DISPLAYLEVEL(6, "t%u: End-flush into dst buffer of size %u \n", testNb, (U32)adjustedDstSize);
                    remainingToFlush = ZSTD_compress_generic(zc, &outBuff, &inBuff, ZSTD_e_end);
+                    DISPLAYLEVEL(6, "t%u: Total flushed so far : %u bytes \n", testNb, (U32)outBuff.pos);
                    CHECK( ZSTD_isError(remainingToFlush),
                          "ZSTD_compress_generic w/ ZSTD_e_end error : %s",
                           ZSTD_getErrorName(remainingToFlush) );
@ -1687,14 +1781,20 @@ static int fuzzerTests_newAPI(U32 seed, U32 nbTests, unsigned startTest, double
                size_t const dstBuffSize = MIN(dstBufferSize - totalGenSize, randomDstSize);
                inBuff.size = inBuff.pos + readCSrcSize;
                outBuff.size = outBuff.pos + dstBuffSize;
-                DISPLAYLEVEL(6, "ZSTD_decompressStream input %u bytes (pos:%u/%u)\n",
-                            (U32)readCSrcSize, (U32)inBuff.pos, (U32)cSize);
+                DISPLAYLEVEL(6, "decompression presented %u new bytes (pos:%u/%u)\n",
+                                (U32)readCSrcSize, (U32)inBuff.pos, (U32)cSize);
                decompressionResult = ZSTD_decompressStream(zd, &outBuff, &inBuff);
+                DISPLAYLEVEL(6, "so far: consumed = %u, produced = %u \n",
+                                (U32)inBuff.pos, (U32)outBuff.pos);
+                if (ZSTD_isError(decompressionResult)) {
+                    DISPLAY("ZSTD_decompressStream error : %s \n", ZSTD_getErrorName(decompressionResult));
+                    findDiff(copyBuffer, dstBuffer, totalTestSize);
+                }
                CHECK (ZSTD_isError(decompressionResult), "decompression error : %s", ZSTD_getErrorName(decompressionResult));
-                DISPLAYLEVEL(6, "inBuff.pos = %u \n", (U32)readCSrcSize);
+                CHECK (inBuff.pos > cSize, "ZSTD_decompressStream consumes too much input : %u > %u ", (U32)inBuff.pos, (U32)cSize);
            }
-            CHECK (outBuff.pos != totalTestSize, "decompressed data : wrong size (%u != %u)", (U32)outBuff.pos, (U32)totalTestSize);
            CHECK (inBuff.pos != cSize, "compressed data should be fully read (%u != %u)", (U32)inBuff.pos, (U32)cSize);
+            CHECK (outBuff.pos != totalTestSize, "decompressed data : wrong size (%u != %u)", (U32)outBuff.pos, (U32)totalTestSize);
            {   U64 const crcDest = XXH64(dstBuffer, totalTestSize, 0);
                if (crcDest!=crcOrig) findDiff(copyBuffer, dstBuffer, totalTestSize);
                CHECK (crcDest!=crcOrig, "decompressed data corrupted");
--- a/sys/contrib/zstd/zlibWrapper/examples/zwrapbench.c
+++ b/sys/contrib/zstd/zlibWrapper/examples/zwrapbench.c
@ -233,7 +233,7 @@ static int BMK_benchMem(z_const void* srcBuffer, size_t srcSize,
                if (compressor == BMK_ZSTD) {
                    ZSTD_parameters const zparams = ZSTD_getParams(cLevel, avgSize, dictBufferSize);
                    ZSTD_customMem const cmem = { NULL, NULL, NULL };
-                    ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dm_auto, zparams.cParams, cmem);
+                    ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_auto, zparams.cParams, cmem);
                    if (cdict==NULL) EXM_THROW(1, "ZSTD_createCDict_advanced() allocation failure");

                    do {